//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
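
  // Illustrative note (not in the original source): a scalar SETCC
  // materializes 0 or 1 in an i8 register, while a vector compare such as
  // PCMPEQD writes each lane as all-zeros or all-ones, so the result is
  // directly usable as a mask with PAND/PANDN.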

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
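
  // Illustrative note (not in the original source): addBypassSlowDiv(64, 32)
  // wraps each 64-bit divide in a run-time test; when both operands happen to
  // fit in 32 bits, the much cheaper 32-bit divide instruction is used
  // instead of the slow 64-bit one.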

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
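  //
  // Illustrative sketch (not in the original source): with SDIV and SREM both
  // Expanded, the legalizer rewrites
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // into two identical ISD::SDIVREM nodes; CSE merges them, so a single IDIV
  // produces the quotient in EAX and the remainder in EDX.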
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
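
  // Illustrative note (not in the original source): ADDC/ADDE model a wide
  // add split into an add that produces a carry and an add-with-carry that
  // consumes it; the custom lowering routes the carry through EFLAGS via
  // glue, matching the ADD/ADC instruction pair.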

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuations, user-level threading, and the like. As a result,
  // no other SjLj exception interfaces are implemented, so please don't build
  // your own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them alone.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
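    //
    // Illustrative example (not in the original source): a sextload from
    // v4i8 to v4i32 can be lowered as one 32-bit scalar load followed by an
    // in-register sign extension, instead of four separate scalar loads.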
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
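
    // Illustrative note (not in the original source): this promotion works by
    // bitcast, e.g. an AND of two v16i8 values is bitcast to v2i64, ANDed
    // there, and bitcast back, so a single PAND pattern covers every integer
    // element width.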

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
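    // Roughly (illustrative note, not in the original source): each i32 lane
    // is split into 16-bit halves, each half is turned into a valid float by
    // blending in exponent bits with a blend-with-immediate, the high half is
    // rescaled, and the two halves are recombined with an FADD.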

    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    }

    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                    MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);

    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These instructions are available on SSE/AVX2, add relevant
      // patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1312 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1313 setOperationAction(ISD::ABS, MVT::v4i64, Legal);
1314 setOperationAction(ISD::ABS, MVT::v2i64, Legal);
1316 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1317 setOperationAction(ISD::ADD, VT, Custom);
1318 setOperationAction(ISD::SUB, VT, Custom);
1319 setOperationAction(ISD::MUL, VT, Custom);
1320 setOperationAction(ISD::SETCC, VT, Custom);
1321 setOperationAction(ISD::SELECT, VT, Custom);
1322 setOperationAction(ISD::TRUNCATE, VT, Custom);
1324 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1325 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1326 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1327 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }
1331 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1332 setOperationAction(ISD::SMAX, VT, Legal);
1333 setOperationAction(ISD::UMAX, VT, Legal);
1334 setOperationAction(ISD::SMIN, VT, Legal);
1335 setOperationAction(ISD::UMIN, VT, Legal);
1336 setOperationAction(ISD::ABS, VT, Legal);
1337 setOperationAction(ISD::SRL, VT, Custom);
1338 setOperationAction(ISD::SHL, VT, Custom);
1339 setOperationAction(ISD::SRA, VT, Custom);
1340 setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }
1344 // Need to promote to 64-bit even though we have 32-bit masked instructions
1345 // because the IR optimizers rearrange bitcasts around logic ops leaving
1346 // too many variations to handle if we don't promote them.
1347 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1348 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1349 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
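    // For example, a (v16i32 and X, Y) coming out of the IR optimizers is
    // rewritten here roughly as
    //   (v16i32 bitcast (and (v8i64 bitcast X), (v8i64 bitcast Y)))
    // so later pattern matching only has to handle the v8i64 form.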
1351 if (Subtarget.hasCDI()) {
1352 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1353 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1354 MVT::v4i64, MVT::v8i64}) {
1355 setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
1358 } // Subtarget.hasCDI()
1360 if (Subtarget.hasDQI()) {
1361 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1362 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1363 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }
1367 // Custom lower several nodes.
1368 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1369 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1370 setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
1373 // Extract subvector is special because the value type
1374 // (result) is 256-bit but the source is 512-bit wide.
1375 // 128-bit was made Custom under AVX1.
1376 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1377 MVT::v8f32, MVT::v4f64 })
1378 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1379 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1380 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1381 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1383 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1384 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1385 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1386 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1387 setOperationAction(ISD::VSELECT, VT, Custom);
1388 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1389 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1390 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1391 setOperationAction(ISD::MLOAD, VT, Legal);
1392 setOperationAction(ISD::MSTORE, VT, Legal);
1393 setOperationAction(ISD::MGATHER, VT, Legal);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
1396 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1397 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }
  }
1402 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1403 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1404 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1406 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1407 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1409 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1410 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1411 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1412 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1413 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1414 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1416 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1417 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1418 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1419 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1420 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1421 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1422 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1423 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1424 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1425 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1427 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1428 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1429 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1431 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1432 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1433 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1434 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1435 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1436 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1437 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1438 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1439 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1440 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1441 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1442 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1443 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1444 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1445 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1446 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1447 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1448 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1449 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1450 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1451 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1452 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1454 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1455 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1456 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1457 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1458 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1459 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1460 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1462 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1464 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1465 if (Subtarget.hasVLX()) {
1466 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
    }
1470 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1471 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1472 setOperationAction(ISD::MLOAD, VT, Action);
      setOperationAction(ISD::MSTORE, VT, Action);
    }
1476 if (Subtarget.hasCDI()) {
1477 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
      setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
    }
1481 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1482 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1483 setOperationAction(ISD::VSELECT, VT, Custom);
1484 setOperationAction(ISD::ABS, VT, Legal);
1485 setOperationAction(ISD::SRL, VT, Custom);
1486 setOperationAction(ISD::SHL, VT, Custom);
1487 setOperationAction(ISD::SRA, VT, Custom);
1488 setOperationAction(ISD::MLOAD, VT, Legal);
1489 setOperationAction(ISD::MSTORE, VT, Legal);
1490 setOperationAction(ISD::CTPOP, VT, Custom);
1491 setOperationAction(ISD::CTTZ, VT, Custom);
1492 setOperationAction(ISD::SMAX, VT, Legal);
1493 setOperationAction(ISD::UMAX, VT, Legal);
1494 setOperationAction(ISD::SMIN, VT, Legal);
1495 setOperationAction(ISD::UMIN, VT, Legal);
1497 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1498 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }
1502 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1503 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1504 if (Subtarget.hasVLX()) {
        // FIXME: These instructions are available on SSE/AVX2; add the
        // relevant patterns.
1506 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
      }
    }
  }
1512 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1513 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1514 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1516 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1517 setOperationAction(ISD::ADD, VT, Custom);
1518 setOperationAction(ISD::SUB, VT, Custom);
1519 setOperationAction(ISD::MUL, VT, Custom);
1520 setOperationAction(ISD::VSELECT, VT, Expand);
1522 setOperationAction(ISD::TRUNCATE, VT, Custom);
1523 setOperationAction(ISD::SETCC, VT, Custom);
1524 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1525 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1526 setOperationAction(ISD::SELECT, VT, Custom);
1527 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }
1531 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1532 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1533 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1534 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1536 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1537 setOperationAction(ISD::SMAX, VT, Legal);
1538 setOperationAction(ISD::UMAX, VT, Legal);
1539 setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
    }
  }
1544 // We want to custom lower some of our intrinsics.
1545 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1546 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1547 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1548 if (!Subtarget.is64Bit()) {
1549 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }
1553 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1554 // handle type legalization for these operations here.
1556 // FIXME: We really should do custom legalization for addition and
1557 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1558 // than generic legalization for 64-bit multiplication-with-overflow, though.
1559 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
1562 // Add/Sub/Mul with overflow operations are custom lowered.
1563 setOperationAction(ISD::SADDO, VT, Custom);
1564 setOperationAction(ISD::UADDO, VT, Custom);
1565 setOperationAction(ISD::SSUBO, VT, Custom);
1566 setOperationAction(ISD::USUBO, VT, Custom);
1567 setOperationAction(ISD::SMULO, VT, Custom);
1568 setOperationAction(ISD::UMULO, VT, Custom);
1570 // Support carry in as value rather than glue.
1571 setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
  }
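  // With ADDCARRY/SUBCARRY the carry travels as an ordinary data value rather
  // than glue, so (for example) a 128-bit add on x86-64 can still become an
  // ADD/ADC pair without glue constraining the scheduler; the exact
  // instruction selection happens later.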
1575 if (!Subtarget.is64Bit()) {
1576 // These libcalls are not available in 32-bit.
1577 setLibcallName(RTLIB::SHL_I128, nullptr);
1578 setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }
1582 // Combine sin / cos into one node or libcall if possible.
1583 if (Subtarget.hasSinCos()) {
1584 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1585 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1586 if (Subtarget.isTargetDarwin()) {
1587 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1588 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1589 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }
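  // As a sketch: a sin(x)/cos(x) pair that the combiner merges into one
  // FSINCOS node is then emitted on Darwin as a single __sincos_stret call
  // that (roughly) returns both results in registers instead of through the
  // two output pointers of the plain sincos libcall.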
1594 if (Subtarget.isTargetWin64()) {
1595 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1596 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1597 setOperationAction(ISD::SREM, MVT::i128, Custom);
1598 setOperationAction(ISD::UREM, MVT::i128, Custom);
1599 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }
1603 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1604 // is. We should promote the value to 64-bits to solve this.
1605 // This is what the CRT headers do - `fmodf` is an inline header
1606 // function casting to f64 and calling `fmod`.
1607 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1608 Subtarget.isTargetWindowsItanium()))
1609 for (ISD::NodeType Op :
1610 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1611 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1612 if (isOperationExpand(Op, MVT::f32))
1613 setOperationAction(Op, MVT::f32, Promote);
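  // In effect, on those targets a call such as fmodf(x, y) is expanded to
  // (float)fmod((double)x, (double)y), mirroring what the CRT headers do
  // inline.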
1615 // We have target-specific dag combine patterns for the following nodes:
1616 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1617 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1618 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1619 setTargetDAGCombine(ISD::BITCAST);
1620 setTargetDAGCombine(ISD::VSELECT);
1621 setTargetDAGCombine(ISD::SELECT);
1622 setTargetDAGCombine(ISD::SHL);
1623 setTargetDAGCombine(ISD::SRA);
1624 setTargetDAGCombine(ISD::SRL);
1625 setTargetDAGCombine(ISD::OR);
1626 setTargetDAGCombine(ISD::AND);
1627 setTargetDAGCombine(ISD::ADD);
1628 setTargetDAGCombine(ISD::FADD);
1629 setTargetDAGCombine(ISD::FSUB);
1630 setTargetDAGCombine(ISD::FNEG);
1631 setTargetDAGCombine(ISD::FMA);
1632 setTargetDAGCombine(ISD::FMINNUM);
1633 setTargetDAGCombine(ISD::FMAXNUM);
1634 setTargetDAGCombine(ISD::SUB);
1635 setTargetDAGCombine(ISD::LOAD);
1636 setTargetDAGCombine(ISD::MLOAD);
1637 setTargetDAGCombine(ISD::STORE);
1638 setTargetDAGCombine(ISD::MSTORE);
1639 setTargetDAGCombine(ISD::TRUNCATE);
1640 setTargetDAGCombine(ISD::ZERO_EXTEND);
1641 setTargetDAGCombine(ISD::ANY_EXTEND);
1642 setTargetDAGCombine(ISD::SIGN_EXTEND);
1643 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1644 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1645 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1646 setTargetDAGCombine(ISD::SINT_TO_FP);
1647 setTargetDAGCombine(ISD::UINT_TO_FP);
1648 setTargetDAGCombine(ISD::SETCC);
1649 setTargetDAGCombine(ISD::MUL);
1650 setTargetDAGCombine(ISD::XOR);
1651 setTargetDAGCombine(ISD::MSCATTER);
1652 setTargetDAGCombine(ISD::MGATHER);
1654 computeRegisterProperties(Subtarget.getRegisterInfo());
1656 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1657 MaxStoresPerMemsetOptSize = 8;
1658 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1659 MaxStoresPerMemcpyOptSize = 4;
1660 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1661 MaxStoresPerMemmoveOptSize = 4;
1662 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1663 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
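  // E.g. with the default value of 4, loop headers get aligned to
  // 1 << 4 = 16 bytes.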
1665 // An out-of-order CPU can speculatively execute past a predictable branch,
1666 // but a conditional move could be stalled by an expensive earlier operation.
1667 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1668 EnableExtLdPromotion = true;
1669 setPrefFunctionAlignment(4); // 2^4 bytes.
  verifyIntrinsicTables();
}
1674 // This has so far only been implemented for 64-bit MachO.
1675 bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
1679 TargetLoweringBase::LegalizeTypeAction
1680 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1681 if (ExperimentalVectorWideningLegalization &&
1682 VT.getVectorNumElements() != 1 &&
1683 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1684 return TypeWidenVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}
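// For example, with the experimental flag enabled a v3i32 is legalized by
// widening to v4i32 instead of promoting its elements; one-element vectors
// and vectors of i1 still take the default path.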
1689 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;
1695 if (VT.isSimple()) {
1696 MVT VVT = VT.getSimpleVT();
1697 const unsigned NumElts = VVT.getVectorNumElements();
1698 MVT EltVT = VVT.getVectorElementType();
1699 if (VVT.is512BitVector()) {
1700 if (Subtarget.hasAVX512())
        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
            EltVT == MVT::f32 || EltVT == MVT::f64)
          switch (NumElts) {
          case  8: return MVT::v8i1;
          case 16: return MVT::v16i1;
          }
      if (Subtarget.hasBWI())
        if (EltVT == MVT::i8 || EltVT == MVT::i16)
          switch (NumElts) {
          case 32: return MVT::v32i1;
          case 64: return MVT::v64i1;
          }
    }
1715 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1716 return MVT::getVectorVT(MVT::i1, NumElts);
1718 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1719 EVT LegalVT = getTypeToTransformTo(Context, VT);
1720 EltVT = LegalVT.getVectorElementType().getSimpleVT();
    }

    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
      switch (NumElts) {
      case 2: return MVT::v2i1;
      case 4: return MVT::v4i1;
      case 8: return MVT::v8i1;
      }
  }
  return VT.changeVectorElementTypeToInteger();
}
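// For example, a compare of two v16i32 vectors produces v16i1 here on an
// AVX-512 subtarget (an i1 mask suited to a k-register), while older
// subtargets fall through to the integer-vector default, e.g. v4f32 -> v4i32.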
1734 /// Helper for getByValTypeAlignment to determine
1735 /// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
1742 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1743 unsigned EltAlign = 0;
1744 getMaxByValAlign(ATy->getElementType(), EltAlign);
1745 if (EltAlign > MaxAlign)
1746 MaxAlign = EltAlign;
1747 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1748 for (auto *EltTy : STy->elements()) {
1749 unsigned EltAlign = 0;
1750 getMaxByValAlign(EltTy, EltAlign);
1751 if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
    }
  }
}
1759 /// Return the desired alignment for ByVal aggregate
1760 /// function arguments in the caller parameter area. For X86, aggregates
1761 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1762 /// are at 4-byte boundaries.
1763 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1764 const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
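// For example, on a 32-bit target with SSE1 a byval struct containing a
// <4 x float> member is aligned to 16 bytes, while a struct of plain ints
// keeps the 4-byte default.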
1779 /// Returns the target specific optimal type for load
1780 /// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, it is safe to assume that the destination
/// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
/// is no need to check it against the alignment requirement,
1784 /// probably because the source does not need to be loaded. If 'IsMemset' is
1785 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1786 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1787 /// source is constant so it does not need to be loaded.
1788 /// It returns EVT::Other if the type should be determined using generic
1789 /// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1792 unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
1796 const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
1800 ((DstAlign == 0 || DstAlign >= 16) &&
1801 (SrcAlign == 0 || SrcAlign >= 16)))) {
1802 // FIXME: Check if unaligned 32-byte accesses are slow.
1803 if (Size >= 32 && Subtarget.hasAVX()) {
1804 // Although this isn't a well-supported type for AVX1, we'll let
1805 // legalization and shuffle lowering produce the optimal codegen. If we
1806 // choose an optimal type with a vector element larger than a byte,
1807 // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
1816 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1817 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1818 // Do not use f64 to lower memcpy if source is string constant. It's
1819 // better to use i32 to avoid the loads.
1820 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1821 // The gymnastics of splatting a byte value into an XMM register and then
1822 // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
1827 // This is a compromise. If we reach here, unaligned accesses may be slow on
1828 // this target. However, creating smaller, aligned accesses could be even
1829 // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
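// As a rough example: a 64-byte memcpy on an AVX subtarget with fast
// unaligned accesses is typically lowered via v32i8, while an SSE2-only
// subtarget uses v16i8 and a bare x86-64 target falls back to i64 chunks.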
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}
bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
1867 /// Return the entry encoding for a jump table in the
1868 /// current function. The returned value is a member of the
1869 /// MachineJumpTableInfo::JTEntryKind enum.
1870 unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
1873 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1874 return MachineJumpTableInfo::EK_Custom32;
1876 // Otherwise, use the normal jump table encoding heuristics.
1877 return TargetLowering::getJumpTableEncoding();
1880 bool X86TargetLowering::useSoftFloat() const {
1881 return Subtarget.useSoftFloat();
1884 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1885 ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
1892 unsigned ParamRegs = 0;
1893 if (auto *M = MF->getFunction()->getParent())
1894 ParamRegs = M->getNumberRegisterParameters();
  // Mark the first N integer arguments as being passed in registers.
1897 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1898 Type *T = Args[Idx].Ty;
1899 if (T->isPointerTy() || T->isIntegerTy())
1900 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1901 unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1914 const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
1916 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
1919 return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
1923 /// Returns relocation base for the given PIC jumptable.
1924 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1925 SelectionDAG &DAG) const {
1926 if (!Subtarget.is64Bit())
1927 // This doesn't have SDLoc associated with it, but is not really the
1928 // same as a Register.
1929 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}
1934 /// This returns the relocation base for the given PIC jumptable,
1935 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1936 const MCExpr *X86TargetLowering::
1937 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1938 MCContext &Ctx) const {
1939 // X86-64 uses RIP relative addressing based on the jump table label.
1940 if (Subtarget.isPICStyleRIPRel())
1941 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1943 // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
1947 std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
1961 case MVT::f32: case MVT::f64:
1962 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1963 case MVT::v4f32: case MVT::v2f64:
1964 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1965 case MVT::v8f32: case MVT::v4f64:
1966 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1967 case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}
1974 unsigned X86TargetLowering::getAddressSpace() const {
1975 if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 0;
}
1980 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1981 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}
1985 static Constant* SegmentOffset(IRBuilder<> &IRB,
1986 unsigned Offset, unsigned AddressSpace) {
1987 return ConstantExpr::getIntToPtr(
1988 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
1992 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1993 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
1994 // tcbhead_t; use it instead of the usual global variable (see
1995 // sysdeps/{i386,x86_64}/nptl/tls.h)
1996 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
1997 if (Subtarget.isTargetFuchsia()) {
1998 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    }
2001 // %fs:0x28, unless we're using a Kernel code model, in which case
2002 // it's %gs:0x28. gs:0x14 on i386.
2003 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }
  return TargetLowering::getIRStackGuard(IRB);
}
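// On 64-bit Linux, for instance, the value built here is roughly the IR
// constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. %fs:0x28, so the guard can be loaded without extra address math.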
2011 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2012 // MSVC CRT provides functionalities for stack protection.
2013 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2014 // MSVC CRT has a global variable holding security cookie.
2015 M.getOrInsertGlobal("__security_cookie",
2016 Type::getInt8PtrTy(M.getContext()));
2018 // MSVC CRT has a function to validate security cookie.
2019 auto *SecurityCheckCookie = cast<Function>(
2020 M.getOrInsertFunction("__security_check_cookie",
2021 Type::getVoidTy(M.getContext()),
2022 Type::getInt8PtrTy(M.getContext())));
2023 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
2027 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2028 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}
2033 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2034 // MSVC CRT has a global variable holding security cookie.
2035 if (Subtarget.getTargetTriple().isOSMSVCRT())
2036 return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}
2040 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2041 // MSVC CRT has a function to validate security cookie.
2042 if (Subtarget.getTargetTriple().isOSMSVCRT())
2043 return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}
2047 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2048 if (Subtarget.getTargetTriple().isOSContiki())
2049 return getDefaultSafeStackPointerLocation(IRB, false);
2051 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2052 // definition of TLS_SLOT_SAFESTACK in
2053 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2054 if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
2057 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }
2061 // Fuchsia is similar.
2062 if (Subtarget.isTargetFuchsia()) {
2063 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
2070 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2071 unsigned DestAS) const {
2072 assert(SrcAS != DestAS && "Expected different address spaces!");
  return SrcAS < 256 && DestAS < 256;
}
2077 //===----------------------------------------------------------------------===//
2078 // Return Value Calling Convention Implementation
2079 //===----------------------------------------------------------------------===//
2081 #include "X86GenCallingConv.inc"
2083 bool X86TargetLowering::CanLowerReturn(
2084 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2085 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2086 SmallVector<CCValAssign, 16> RVLocs;
2087 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}
2091 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values.
/// \returns the DAG node after lowering to the register type.
2098 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2099 const SDLoc &Dl, SelectionDAG &DAG) {
2100 EVT ValVT = ValArg.getValueType();
2102 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2103 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2104 // Two stage lowering might be required
2105 // bitcast: v8i1 -> i8 / v16i1 -> i16
2106 // anyextend: i8 -> i32 / i16 -> i32
2107 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2108 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
2112 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2113 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2114 // One stage lowering is required
2115 // bitcast: v32i1 -> i32 / v64i1 -> i64
2116 return DAG.getBitcast(ValLoc, ValArg);
  } else
    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}
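// For example, a v8i1 mask assigned to a 32-bit location becomes
//   (i32 anyext (i8 bitcast %mask))
// whereas a v64i1 mask in an i64 location is a single bitcast.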
2121 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2122 static void Passv64i1ArgInRegs(
2123 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2124 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2125 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2126 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2127 "Expected AVX512BW or AVX512BMI target!");
2128 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2129 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2130 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2131 "The value should reside in two registers");
2133 // Before splitting the value we cast it to i64
2134 Arg = DAG.getBitcast(MVT::i64, Arg);
  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2139 DAG.getConstant(0, Dl, MVT::i32));
2140 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2141 DAG.getConstant(1, Dl, MVT::i32));
2143 // Attach the two i32 types into corresponding registers
2144 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
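// E.g. on a 32-bit AVX-512BW target a v64i1 mask travels as two i32 halves in
// whatever register pair the calling convention assigned to VA and NextVA.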
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
2151 const SmallVectorImpl<ISD::OutputArg> &Outs,
2152 const SmallVectorImpl<SDValue> &OutVals,
2153 const SDLoc &dl, SelectionDAG &DAG) const {
2154 MachineFunction &MF = DAG.getMachineFunction();
2155 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2157 // In some cases we need to disable registers from the default CSR list.
2158 // For example, when they are used for argument passing.
2159 bool ShouldDisableCalleeSavedRegister =
2160 CallConv == CallingConv::X86_RegCall ||
2161 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2163 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2164 report_fatal_error("X86 interrupts may not return any value");
2166 SmallVector<CCValAssign, 16> RVLocs;
2167 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2168 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
2172 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2173 // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));
2177 // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
2180 CCValAssign &VA = RVLocs[I];
2181 assert(VA.isRegLoc() && "Can only return in registers!");
2183 // Add the register to the CalleeSaveDisableRegs list.
2184 if (ShouldDisableCalleeSavedRegister)
2185 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2187 SDValue ValToCopy = OutVals[OutsIndex];
2188 EVT ValVT = ValToCopy.getValueType();
2190 // Promote values to the appropriate types.
2191 if (VA.getLocInfo() == CCValAssign::SExt)
2192 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2193 else if (VA.getLocInfo() == CCValAssign::ZExt)
2194 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2195 else if (VA.getLocInfo() == CCValAssign::AExt) {
2196 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2197 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2199 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2201 else if (VA.getLocInfo() == CCValAssign::BCvt)
2202 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2204 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2205 "Unexpected FP-extend for return value.");
2207 // If this is x86-64, and we disabled SSE, we can't return FP values,
2208 // or SSE or MMX vectors.
2209 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2210 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2211 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2212 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2213 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2214 } else if (ValVT == MVT::f64 &&
2215 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2216 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2217 // llvm-gcc has never done it right and no one has noticed, so this
2218 // should be OK for now.
2219 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2220 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2223 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2224 // the RET instruction and handled by the FP Stackifier.
2225 if (VA.getLocReg() == X86::FP0 ||
2226 VA.getLocReg() == X86::FP1) {
2227 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2228 // change the value to the FP stack register class.
2229 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2230 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2231 RetOps.push_back(ValToCopy);
2232 // Don't emit a copytoreg.
2236 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2237 // which is returned in RAX / RDX.
2238 if (Subtarget.is64Bit()) {
2239 if (ValVT == MVT::x86mmx) {
2240 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2241 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
2244 // If we don't have SSE2 available, convert to v4f32 so the generated
2245 // register is legal.
2246 if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }
2252 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2254 if (VA.needsCustom()) {
2255 assert(VA.getValVT() == MVT::v64i1 &&
2256 "Currently the only custom case is when we split v64i1 to 2 regs");
      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);
2261 assert(2 == RegsToPass.size() &&
2262 "Expecting two registers after Pass64BitArgInRegs");
2264 // Add the second register to the CalleeSaveDisableRegs list.
2265 if (ShouldDisableCalleeSavedRegister)
2266 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }
2271 // Add nodes to the DAG and add the values into the RetOps list
2272 for (auto &Reg : RegsToPass) {
2273 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2274 Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }
2279 // Swift calling convention does not require we copy the sret argument
2280 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2282 // All x86 ABIs require that for returning structs by value we copy
2283 // the sret argument into %rax/%eax (depending on ABI) for the return.
2284 // We saved the argument into a virtual register in the entry block,
2285 // so now we copy the value out and into %rax/%eax.
2287 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2288 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2289 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2290 // either case FuncInfo->setSRetReturnReg() will have been called.
2291 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2292 // When we have both sret and another return value, we should use the
2293 // original Chain stored in RetOps[0], instead of the current Chain updated
2294 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2296 // For the case of sret and another return value, we have
2297 // Chain_0 at the function entry
2298 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2299 // If we use Chain_1 in getCopyFromReg, we will have
2300 // Val = getCopyFromReg(Chain_1)
2301 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2303 // getCopyToReg(Chain_0) will be glued together with
2304 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2305 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2306 // Data dependency from Unit B to Unit A due to usage of Val in
2307 // getCopyToReg(Chain_1, Val)
2308 // Chain dependency from Unit A to Unit B
2310 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2311 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2312 getPointerTy(MF.getDataLayout()));
    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
2317 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2318 Flag = Chain.getValue(1);
2320 // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2324 // Add the returned register to the CalleeSaveDisableRegs list.
2325 if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }
2329 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2330 const MCPhysReg *I =
2331 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }
2341 RetOps[0] = Chain; // Update chain.
2343 // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);
2347 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2348 if (CallConv == CallingConv::X86_INTR)
2349 opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
2353 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;
2357 SDValue TCChain = Chain;
2358 SDNode *Copy = *N->use_begin();
2359 if (Copy->getOpcode() == ISD::CopyToReg) {
2360 // If the copy has a glue operand, we conservatively assume it isn't safe to
2361 // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;
2368 bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
2390 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2391 ISD::NodeType ExtendKind) const {
2392 MVT ReturnMVT = MVT::i32;
2394 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2395 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2396 // The ABI does not require i1, i8 or i16 to be extended.
2398 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2399 // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }
2404 EVT MinVT = getRegisterType(Context, ReturnMVT);
2405 return VT.bitsLT(MinVT) ? MinVT : VT;
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new 64-bit SDValue.
2417 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2418 SDValue &Root, SelectionDAG &DAG,
2419 const SDLoc &Dl, const X86Subtarget &Subtarget,
2420 SDValue *InFlag = nullptr) {
2421 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2422 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2423 assert(VA.getValVT() == MVT::v64i1 &&
2424 "Expecting first location of 64 bit width type");
2425 assert(NextVA.getValVT() == VA.getValVT() &&
2426 "The locations should have the same type");
2427 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2428 "The values should reside in two registers");
  unsigned Reg;
  SDValue Lo, Hi;
  SDValue ArgValueLo, ArgValueHi;
2434 MachineFunction &MF = DAG.getMachineFunction();
2435 const TargetRegisterClass *RC = &X86::GR32RegClass;
2437 // Read a 32 bit value from the registers
2438 if (nullptr == InFlag) {
2439 // When no physical register is present,
2440 // create an intermediate virtual register
2441 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2442 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2443 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2444 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2446 // When a physical register is available read the value from it and glue
2447 // the reads together.
    ArgValueLo =
      DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2450 *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
      DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }
2456 // Convert the i32 type into v32i1 type
2457 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2459 // Convert the i32 type into v32i1 type
2460 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2462 // Concatenate the two values together
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
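// This is the inverse of Passv64i1ArgInRegs: the two i32 register pieces are
// bitcast to v32i1 halves and concatenated back into the original v64i1 mask.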
/// Lower a register of various sizes (8/16/32/64) to a mask value of the
/// expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
2469 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2470 const EVT &ValLoc, const SDLoc &Dl,
2471 SelectionDAG &DAG) {
2472 SDValue ValReturned = ValArg;
2474 if (ValVT == MVT::v1i1)
2475 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2477 if (ValVT == MVT::v64i1) {
2478 // In 32 bit machine, this case is handled by getv64i1Argument
2479 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64 bit machine there is no need to truncate the value; the
    // bitcast below is enough.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}
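// For example, a v16i1 mask that arrived in an i32 location becomes
//   (v16i1 bitcast (i16 trunc %val))
// whereas a v64i1 value in an i64 location is bitcast directly.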
2502 /// Lower the result values of a call into the
2503 /// appropriate copies out of appropriate physical registers.
2505 SDValue X86TargetLowering::LowerCallResult(
2506 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2507 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2508 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2509 uint32_t *RegMask) const {
2511 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2512 // Assign locations to each value returned by this call.
2513 SmallVector<CCValAssign, 16> RVLocs;
2514 bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
2517 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2519 // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
2522 CCValAssign &VA = RVLocs[I];
2523 EVT CopyVT = VA.getLocVT();
2525 // In some calling conventions we need to remove the used registers
2526 // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }
2533 // If this is x86-64, and we disabled SSE, we can't return FP values
2534 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2535 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2536 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2537 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2540 // If we prefer to use the value in xmm registers, copy it out as f80 and
2541 // use a truncate to move it from fp stack reg to xmm reg.
2542 bool RoundAfterCopy = false;
2543 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2544 isScalarFPTypeInSSEReg(VA.getValVT())) {
2545 if (!Subtarget.hasX87())
2546 report_fatal_error("X87 register return with X87 disabled");
2548 RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
2553 assert(VA.getValVT() == MVT::v64i1 &&
2554 "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
2561 InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2566 // This truncation won't change the value.
2567 DAG.getIntPtrConstant(1, dl));
2569 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2570 if (VA.getValVT().isVector() &&
2571 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2572 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2573 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2574 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}
2585 //===----------------------------------------------------------------------===//
2586 // C & StdCall & Fast Calling Convention implementation
2587 //===----------------------------------------------------------------------===//
// The StdCall calling convention is standard for many Windows API routines.
// It differs from the C calling convention only slightly: the callee cleans
// up the stack instead of the caller, and symbols are decorated differently.
// It does not support any vector arguments.
2592 // For info on fast calling convention see Fast Calling Convention (tail call)
2593 // implementation LowerX86_32FastCCCallTo.
/// Determines whether a call uses struct return semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
2602 static StructReturnType
2603 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;
2607 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2608 if (!Flags.isSRet())
2609 return NotStructReturn;
2610 if (Flags.isInReg() || IsMCU)
2611 return RegStructReturn;
  return StackStructReturn;
}
2615 /// Determines whether a function uses struct return semantics.
2616 static StructReturnType
2617 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;
2621 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2622 if (!Flags.isSRet())
2623 return NotStructReturn;
2624 if (Flags.isInReg() || IsMCU)
2625 return RegStructReturn;
  return StackStructReturn;
}
2629 /// Make a copy of an aggregate at address specified by "Src" to address
2630 /// "Dst" with size and alignment information specified by the specific
2631 /// parameter attribute. The copy will be passed as a byval function parameter.
2632 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2633 SDValue Chain, ISD::ArgFlagsTy Flags,
2634 SelectionDAG &DAG, const SDLoc &dl) {
2635 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2637 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2638 /*isVolatile*/false, /*AlwaysInline=*/true,
2639 /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}
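// The copy is an always-inline memcpy node, so a small byval aggregate
// typically ends up as a short sequence of loads and stores rather than a
// libcall.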
2643 /// Return true if the calling convention is one that we can guarantee TCO for.
2644 static bool canGuaranteeTCO(CallingConv::ID CC) {
2645 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2646 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}
2650 /// Return true if we might ever do TCO for calls with this calling convention.
2651 static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
2654 case CallingConv::C:
2655 case CallingConv::X86_64_Win64:
2656 case CallingConv::X86_64_SysV:
2657 // Callee pop conventions:
2658 case CallingConv::X86_ThisCall:
2659 case CallingConv::X86_StdCall:
2660 case CallingConv::X86_VectorCall:
2661 case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}
2668 /// Return true if the function is being made into a tailcall target by
2669 /// changing its ABI.
2670 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2671 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2674 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;
2680 ImmutableCallSite CS(CI);
2681 CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2690 const SmallVectorImpl<ISD::InputArg> &Ins,
2691 const SDLoc &dl, SelectionDAG &DAG,
2692 const CCValAssign &VA,
2693 MachineFrameInfo &MFI, unsigned i) const {
2694 // Create the nodes corresponding to a load from this parameter slot.
2695 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2696 bool AlwaysUseMutable = shouldGuaranteeTCO(
2697 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2698 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2700 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2702 // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // bit width.
2705 bool ExtendedInMem =
2706 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2707 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
  EVT ValVT;
  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();
2714 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2715 // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
2718 // X86 interrupts may take one or two arguments.
2719 // On the stack there will be no return address as in regular call.
2720 // Offset of last argument need to be set to -4/-8 bytes.
2721 // Where offset of the first argument out of two, should be set to 0 bytes.
2722 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2723 if (Subtarget.is64Bit() && Ins.size() == 2) {
2724 // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }
2730 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2731 // changed with more analysis.
2732 // In case of tail call optimization mark all arguments mutable. Since they
2733 // could be overwritten by lowering of arguments in case of a tail call.
2734 if (Flags.isByVal()) {
2735 unsigned Bytes = Flags.getByValSize();
2736 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2737 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2738 // Adjust SP offset of interrupt parameter.
2739 if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }
2745 // This is an argument in memory. We might be able to perform copy elision.
2746 if (Flags.isCopyElisionCandidate()) {
2747 EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
2750 // If this is a one-part value or the first part of a multi-part value,
2751 // create a stack object for the entire argument value type and return a
2752 // load from our portion of it. This assumes that if the first part of an
2753 // argument is in memory, the rest will also be in memory.
2754 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2755 /*Immutable=*/false);
2756 PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
2762 // already a fixed stack object including this offset. If so, assume it
2763 // was created by the PartOffset == 0 branch above and create a load from
2764 // the appropriate offset into it.
2765 int64_t PartBegin = VA.getLocMemOffset();
2766 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2767 int FI = MFI.getObjectIndexBegin();
2768 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2769 int64_t ObjBegin = MFI.getObjectOffset(FI);
2770 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }
2786 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2787 VA.getLocMemOffset(), isImmutable);
2789 // Set SExt or ZExt flag.
2790 if (VA.getLocInfo() == CCValAssign::ZExt) {
2791 MFI.setObjectZExt(FI, true);
2792 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2793 MFI.setObjectSExt(FI, true);
2796 // Adjust SP offset of interrupt parameter.
2797 if (CallConv == CallingConv::X86_INTR) {
2798 MFI.setObjectOffset(FI, Offset);
2801 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2802 SDValue Val = DAG.getLoad(
2803 ValVT, dl, Chain, FIN,
2804 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2805 return ExtendedInMem
2806 ? (VA.getValVT().isVector()
2807 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2808 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2812 // FIXME: Get this from tablegen.
2813 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2814 const X86Subtarget &Subtarget) {
2815 assert(Subtarget.is64Bit());
2817 if (Subtarget.isCallingConvWin64(CallConv)) {
2818 static const MCPhysReg GPR64ArgRegsWin64[] = {
2819 X86::RCX, X86::RDX, X86::R8, X86::R9
2821 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2824 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2825 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2827 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2830 // FIXME: Get this from tablegen.
2831 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2832 CallingConv::ID CallConv,
2833 const X86Subtarget &Subtarget) {
2834 assert(Subtarget.is64Bit());
2835 if (Subtarget.isCallingConvWin64(CallConv)) {
2836 // The XMM registers which might contain var arg parameters are shadowed
2837 // in their paired GPR, so we only need to save the GPR to their home address.
2839 // TODO: __vectorcall will change this.
2843 const Function *Fn = MF.getFunction();
2844 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2845 bool isSoftFloat = Subtarget.useSoftFloat();
2846 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2847 "SSE register cannot be used when SSE is disabled!");
2848 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2849 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2853 static const MCPhysReg XMMArgRegs64Bit[] = {
2854 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2855 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2857 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2861 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2862 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2863 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2864 return A.getValNo() < B.getValNo();
2869 SDValue X86TargetLowering::LowerFormalArguments(
2870 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2871 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2872 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2873 MachineFunction &MF = DAG.getMachineFunction();
2874 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2875 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2877 const Function *Fn = MF.getFunction();
2878 if (Fn->hasExternalLinkage() &&
2879 Subtarget.isTargetCygMing() &&
2880 Fn->getName() == "main")
2881 FuncInfo->setForceFramePointer(true);
2883 MachineFrameInfo &MFI = MF.getFrameInfo();
2884 bool Is64Bit = Subtarget.is64Bit();
2885 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2888 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2889 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2891 if (CallConv == CallingConv::X86_INTR) {
2892 bool isLegal = Ins.size() == 1 ||
2893 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2894 (!Is64Bit && Ins[1].VT == MVT::i32)));
2896 report_fatal_error("X86 interrupts may take one or two arguments");
2899 // Assign locations to all of the incoming arguments.
2900 SmallVector<CCValAssign, 16> ArgLocs;
2901 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2903 // Allocate shadow area for Win64.
2905 CCInfo.AllocateStack(32, 8);
2907 CCInfo.AnalyzeArguments(Ins, CC_X86);
2909 // In the vectorcall calling convention, a second pass is required for the HVA registers.
2911 if (CallingConv::X86_VectorCall == CallConv) {
2912 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2915 // The next loop assumes that the locations are in the same order as the Ins arguments.
2917 assert(isSortedByValueNo(ArgLocs) &&
2918 "Argument Location list must be sorted before lowering");
2921 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2923 assert(InsIndex < Ins.size() && "Invalid Ins index");
2924 CCValAssign &VA = ArgLocs[I];
2926 if (VA.isRegLoc()) {
2927 EVT RegVT = VA.getLocVT();
2928 if (VA.needsCustom()) {
2930 VA.getValVT() == MVT::v64i1 &&
2931 "Currently the only custom case is when we split v64i1 to 2 regs");
2933 // In the regcall calling convention, v64i1 values that are compiled
2934 // for a 32-bit arch are split up into two registers.
2936 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2938 const TargetRegisterClass *RC;
2939 if (RegVT == MVT::i32)
2940 RC = &X86::GR32RegClass;
2941 else if (Is64Bit && RegVT == MVT::i64)
2942 RC = &X86::GR64RegClass;
2943 else if (RegVT == MVT::f32)
2944 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2945 else if (RegVT == MVT::f64)
2946 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2947 else if (RegVT == MVT::f80)
2948 RC = &X86::RFP80RegClass;
2949 else if (RegVT == MVT::f128)
2950 RC = &X86::FR128RegClass;
2951 else if (RegVT.is512BitVector())
2952 RC = &X86::VR512RegClass;
2953 else if (RegVT.is256BitVector())
2954 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2955 else if (RegVT.is128BitVector())
2956 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2957 else if (RegVT == MVT::x86mmx)
2958 RC = &X86::VR64RegClass;
2959 else if (RegVT == MVT::v1i1)
2960 RC = &X86::VK1RegClass;
2961 else if (RegVT == MVT::v8i1)
2962 RC = &X86::VK8RegClass;
2963 else if (RegVT == MVT::v16i1)
2964 RC = &X86::VK16RegClass;
2965 else if (RegVT == MVT::v32i1)
2966 RC = &X86::VK32RegClass;
2967 else if (RegVT == MVT::v64i1)
2968 RC = &X86::VK64RegClass;
2970 llvm_unreachable("Unknown argument type!");
2972 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2973 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2976 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2977 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
2979 if (VA.getLocInfo() == CCValAssign::SExt)
2980 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2981 DAG.getValueType(VA.getValVT()));
2982 else if (VA.getLocInfo() == CCValAssign::ZExt)
2983 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2984 DAG.getValueType(VA.getValVT()));
2985 else if (VA.getLocInfo() == CCValAssign::BCvt)
2986 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2988 if (VA.isExtInLoc()) {
2989 // Handle MMX values passed in XMM regs.
2990 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2991 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2992 else if (VA.getValVT().isVector() &&
2993 VA.getValVT().getScalarType() == MVT::i1 &&
2994 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2995 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2996 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2997 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2999 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3002 assert(VA.isMemLoc());
3004 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3007 // If the value is passed via a pointer, do a load.
3008 if (VA.getLocInfo() == CCValAssign::Indirect)
3010 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3012 InVals.push_back(ArgValue);
3015 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3016 // The Swift calling convention does not require that we copy the sret argument
3017 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3018 if (CallConv == CallingConv::Swift)
3021 // All x86 ABIs require that for returning structs by value we copy the
3022 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3023 // the argument into a virtual register so that we can access it from the return points.
3025 if (Ins[I].Flags.isSRet()) {
3026 unsigned Reg = FuncInfo->getSRetReturnReg();
3028 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3029 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3030 FuncInfo->setSRetReturnReg(Reg);
3032 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3033 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3038 unsigned StackSize = CCInfo.getNextStackOffset();
3039 // Align stack specially for tail calls.
3040 if (shouldGuaranteeTCO(CallConv,
3041 MF.getTarget().Options.GuaranteedTailCallOpt))
3042 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3044 // If the function takes variable number of arguments, make a frame index for
3045 // the start of the first vararg value... for expansion of llvm.va_start. We
3046 // can skip this if there are no va_start calls.
3047 if (MFI.hasVAStart() &&
3048 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3049 CallConv != CallingConv::X86_ThisCall))) {
3050 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3053 // Figure out if XMM registers are in use.
3054 assert(!(Subtarget.useSoftFloat() &&
3055 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3056 "SSE register cannot be used when SSE is disabled!");
3058 // 64-bit calling conventions support varargs and register parameters, so we
3059 // have to do extra work to spill them in the prologue.
3060 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3061 // Find the first unallocated argument registers.
3062 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3063 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3064 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3065 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3066 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3067 "SSE register cannot be used when SSE is disabled!");
3069 // Gather all the live in physical registers.
3070 SmallVector<SDValue, 6> LiveGPRs;
3071 SmallVector<SDValue, 8> LiveXMMRegs;
3073 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3074 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3076 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3078 if (!ArgXMMs.empty()) {
3079 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3080 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3081 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3082 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3083 LiveXMMRegs.push_back(
3084 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3089 // Get to the caller-allocated home save location. Add 8 to account
3090 // for the return address.
3091 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3092 FuncInfo->setRegSaveFrameIndex(
3093 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3094 // Fixup to set vararg frame on shadow area (4 x i64).
3096 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3098 // For X86-64, if there are vararg parameters that are passed via
3099 // registers, then we must store them to their spots on the stack so
3100 // they may be loaded by dereferencing the result of va_next.
3101 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3102 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3103 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3104 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
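// A worked sizing example (an assumption based on the SysV AMD64 ABI, not
// taken from this file): with 6 GPRs and 8 XMMs available, the register save
// area is 6 * 8 + 8 * 16 = 176 bytes, which matches the gp_offset range
// [0, 48) and fp_offset range [48, 176) that va_arg bookkeeping expects.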
3107 // Store the integer parameter registers.
3108 SmallVector<SDValue, 8> MemOps;
3109 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3110 getPointerTy(DAG.getDataLayout()));
3111 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3112 for (SDValue Val : LiveGPRs) {
3113 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3114 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3116 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3117 MachinePointerInfo::getFixedStack(
3118 DAG.getMachineFunction(),
3119 FuncInfo->getRegSaveFrameIndex(), Offset));
3120 MemOps.push_back(Store);
3124 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3125 // Now store the XMM (fp + vector) parameter registers.
3126 SmallVector<SDValue, 12> SaveXMMOps;
3127 SaveXMMOps.push_back(Chain);
3128 SaveXMMOps.push_back(ALVal);
3129 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3130 FuncInfo->getRegSaveFrameIndex(), dl));
3131 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3132 FuncInfo->getVarArgsFPOffset(), dl));
3133 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3135 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3136 MVT::Other, SaveXMMOps));
3139 if (!MemOps.empty())
3140 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3143 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3144 // Find the largest legal vector type.
3145 MVT VecVT = MVT::Other;
3146 // FIXME: Only some x86_32 calling conventions support AVX512.
3147 if (Subtarget.hasAVX512() &&
3148 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3149 CallConv == CallingConv::Intel_OCL_BI)))
3150 VecVT = MVT::v16f32;
3151 else if (Subtarget.hasAVX())
3153 else if (Subtarget.hasSSE2())
3156 // We forward some GPRs and some vector types.
3157 SmallVector<MVT, 2> RegParmTypes;
3158 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3159 RegParmTypes.push_back(IntVT);
3160 if (VecVT != MVT::Other)
3161 RegParmTypes.push_back(VecVT);
3163 // Compute the set of forwarded registers. The rest are scratch.
3164 SmallVectorImpl<ForwardedRegister> &Forwards =
3165 FuncInfo->getForwardedMustTailRegParms();
3166 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3168 // Conservatively forward AL on x86_64, since it might be used for varargs.
3169 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3170 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3171 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3174 // Copy all forwards from physical to virtual registers.
3175 for (ForwardedRegister &F : Forwards) {
3176 // FIXME: Can we use a less constrained schedule?
3177 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3178 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3179 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3183 // Some CCs need callee pop.
3184 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3185 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3186 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3187 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3188 // X86 interrupts must pop the error code (and the alignment padding) if present.
3190 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3192 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3193 // If this is an sret function, the return should pop the hidden pointer.
3194 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3195 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3196 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3197 FuncInfo->setBytesToPopOnReturn(4);
3201 // RegSaveFrameIndex is X86-64 only.
3202 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3203 if (CallConv == CallingConv::X86_FastCall ||
3204 CallConv == CallingConv::X86_ThisCall)
3205 // fastcc functions can't have varargs.
3206 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3209 FuncInfo->setArgumentStackSize(StackSize);
3211 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3212 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3213 if (Personality == EHPersonality::CoreCLR) {
3215 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3216 // that we'd prefer this slot be allocated towards the bottom of the frame
3217 // (i.e. near the stack pointer after allocating the frame). Every
3218 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3219 // offset from the bottom of this and each funclet's frame must be the
3220 // same, so the size of funclets' (mostly empty) frames is dictated by
3221 // how far this slot is from the bottom (since they allocate just enough
3222 // space to accommodate holding this slot at the correct offset).
3223 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3224 EHInfo->PSPSymFrameIdx = PSPSymFI;
3228 if (CallConv == CallingConv::X86_RegCall ||
3229 Fn->hasFnAttribute("no_caller_saved_registers")) {
3230 const MachineRegisterInfo &MRI = MF.getRegInfo();
3231 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3232 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3238 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3239 SDValue Arg, const SDLoc &dl,
3241 const CCValAssign &VA,
3242 ISD::ArgFlagsTy Flags) const {
3243 unsigned LocMemOffset = VA.getLocMemOffset();
3244 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3245 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3247 if (Flags.isByVal())
3248 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3250 return DAG.getStore(
3251 Chain, dl, Arg, PtrOff,
3252 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3255 /// Emit a load of return address if tail call
3256 /// optimization is performed and it is required.
3257 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3258 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3259 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3260 // Adjust the Return address stack slot.
3261 EVT VT = getPointerTy(DAG.getDataLayout());
3262 OutRetAddr = getReturnAddressFrameIndex(DAG);
3264 // Load the "old" Return address.
3265 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3266 return SDValue(OutRetAddr.getNode(), 1);
3269 /// Emit a store of the return address if tail call
3270 /// optimization is performed and it is required (FPDiff!=0).
3271 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3272 SDValue Chain, SDValue RetAddrFrIdx,
3273 EVT PtrVT, unsigned SlotSize,
3274 int FPDiff, const SDLoc &dl) {
3275 // Store the return address to the appropriate stack slot.
3276 if (!FPDiff) return Chain;
3277 // Calculate the new stack slot for the return address.
3278 int NewReturnAddrFI =
3279 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3281 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3282 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3283 MachinePointerInfo::getFixedStack(
3284 DAG.getMachineFunction(), NewReturnAddrFI));
3288 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3289 /// operation of specified width.
3290 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3292 unsigned NumElems = VT.getVectorNumElements();
3293 SmallVector<int, 8> Mask;
3294 Mask.push_back(NumElems);
3295 for (unsigned i = 1; i != NumElems; ++i)
3297 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
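// Quick illustration (informal): for VT == MVT::v4f32 the mask built above is
// {4, 1, 2, 3}, i.e. element 0 is taken from V2 and elements 1..3 from V1,
// which is exactly the merge behavior of movss.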
3301 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3302 SmallVectorImpl<SDValue> &InVals) const {
3303 SelectionDAG &DAG = CLI.DAG;
3305 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3306 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3307 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3308 SDValue Chain = CLI.Chain;
3309 SDValue Callee = CLI.Callee;
3310 CallingConv::ID CallConv = CLI.CallConv;
3311 bool &isTailCall = CLI.IsTailCall;
3312 bool isVarArg = CLI.IsVarArg;
3314 MachineFunction &MF = DAG.getMachineFunction();
3315 bool Is64Bit = Subtarget.is64Bit();
3316 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3317 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3318 bool IsSibcall = false;
3319 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3320 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3321 const CallInst *CI =
3322 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3323 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3324 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3325 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3327 if (CallConv == CallingConv::X86_INTR)
3328 report_fatal_error("X86 interrupts may not be called directly");
3330 if (Attr.getValueAsString() == "true")
3333 if (Subtarget.isPICStyleGOT() &&
3334 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3335 // If we are using a GOT, disable tail calls to external symbols with
3336 // default visibility. Tail calling such a symbol requires using a GOT
3337 // relocation, which forces early binding of the symbol. This breaks code
3338 // that requires lazy function symbol resolution. Using musttail or
3339 // GuaranteedTailCallOpt will override this.
3340 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3341 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3342 G->getGlobal()->hasDefaultVisibility()))
3346 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3348 // Force this to be a tail call. The verifier rules are enough to ensure
3349 // that we can lower this successfully without moving the return address around.
3352 } else if (isTailCall) {
3353 // Check if it's really possible to do a tail call.
3354 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3355 isVarArg, SR != NotStructReturn,
3356 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3357 Outs, OutVals, Ins, DAG);
3359 // Sibcalls are automatically detected tailcalls which do not require ABI changes.
3361 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3368 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3369 "Var args not supported with calling convention fastcc, ghc or hipe");
3371 // Analyze operands of the call, assigning locations to each operand.
3372 SmallVector<CCValAssign, 16> ArgLocs;
3373 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3375 // Allocate shadow area for Win64.
3377 CCInfo.AllocateStack(32, 8);
3379 CCInfo.AnalyzeArguments(Outs, CC_X86);
3381 // In the vectorcall calling convention, a second pass is required for the HVA registers.
3383 if (CallingConv::X86_VectorCall == CallConv) {
3384 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3387 // Get a count of how many bytes are to be pushed on the stack.
3388 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3390 // This is a sibcall. The memory operands are already available in the
3391 // stack frame of the caller's own caller.
3393 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3394 canGuaranteeTCO(CallConv))
3395 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3398 if (isTailCall && !IsSibcall && !IsMustTail) {
3399 // Lower arguments at fp - stackoffset + fpdiff.
3400 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3402 FPDiff = NumBytesCallerPushed - NumBytes;
3404 // Set the delta by which the return-address stack slot moves,
3405 // but only if this delta is greater than the previous delta.
3406 if (FPDiff < X86Info->getTCReturnAddrDelta())
3407 X86Info->setTCReturnAddrDelta(FPDiff);
3410 unsigned NumBytesToPush = NumBytes;
3411 unsigned NumBytesToPop = NumBytes;
3413 // If we have an inalloca argument, all stack space has already been allocated
3414 // for us and is right at the top of the stack. We don't support multiple
3415 // arguments passed in memory when using inalloca.
3416 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3418 if (!ArgLocs.back().isMemLoc())
3419 report_fatal_error("cannot use inalloca attribute on a register "
3421 if (ArgLocs.back().getLocMemOffset() != 0)
3422 report_fatal_error("any parameter with the inalloca attribute must be "
3423 "the only memory argument");
3427 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3428 NumBytes - NumBytesToPush, dl);
3430 SDValue RetAddrFrIdx;
3431 // Load return address for tail calls.
3432 if (isTailCall && FPDiff)
3433 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3434 Is64Bit, FPDiff, dl);
3436 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3437 SmallVector<SDValue, 8> MemOpChains;
3440 // The next loop assumes that the locations are in the same order as the Outs arguments.
3442 assert(isSortedByValueNo(ArgLocs) &&
3443 "Argument Location list must be sorted before lowering");
3445 // Walk the register/memloc assignments, inserting copies/loads. In the case
3446 // of tail call optimization, arguments are handled later.
3447 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3448 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3450 assert(OutIndex < Outs.size() && "Invalid Out index");
3451 // Skip inalloca arguments, they have already been written.
3452 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3453 if (Flags.isInAlloca())
3456 CCValAssign &VA = ArgLocs[I];
3457 EVT RegVT = VA.getLocVT();
3458 SDValue Arg = OutVals[OutIndex];
3459 bool isByVal = Flags.isByVal();
3461 // Promote the value if needed.
3462 switch (VA.getLocInfo()) {
3463 default: llvm_unreachable("Unknown loc info!");
3464 case CCValAssign::Full: break;
3465 case CCValAssign::SExt:
3466 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3468 case CCValAssign::ZExt:
3469 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3471 case CCValAssign::AExt:
3472 if (Arg.getValueType().isVector() &&
3473 Arg.getValueType().getVectorElementType() == MVT::i1)
3474 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3475 else if (RegVT.is128BitVector()) {
3476 // Special case: passing MMX values in XMM registers.
3477 Arg = DAG.getBitcast(MVT::i64, Arg);
3478 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3479 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3481 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3483 case CCValAssign::BCvt:
3484 Arg = DAG.getBitcast(RegVT, Arg);
3486 case CCValAssign::Indirect: {
3487 // Store the argument.
3488 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3489 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3490 Chain = DAG.getStore(
3491 Chain, dl, Arg, SpillSlot,
3492 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3498 if (VA.needsCustom()) {
3499 assert(VA.getValVT() == MVT::v64i1 &&
3500 "Currently the only custom case is when we split v64i1 to 2 regs");
3501 // Split v64i1 value into two registers
3502 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3504 } else if (VA.isRegLoc()) {
3505 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3506 if (isVarArg && IsWin64) {
3507 // The Win64 ABI requires an argument XMM reg to be copied to the
3508 // corresponding shadow reg if the callee is a varargs function.
3509 unsigned ShadowReg = 0;
3510 switch (VA.getLocReg()) {
3511 case X86::XMM0: ShadowReg = X86::RCX; break;
3512 case X86::XMM1: ShadowReg = X86::RDX; break;
3513 case X86::XMM2: ShadowReg = X86::R8; break;
3514 case X86::XMM3: ShadowReg = X86::R9; break;
3517 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3519 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3520 assert(VA.isMemLoc());
3521 if (!StackPtr.getNode())
3522 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3523 getPointerTy(DAG.getDataLayout()));
3524 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3525 dl, DAG, VA, Flags));
3529 if (!MemOpChains.empty())
3530 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3532 if (Subtarget.isPICStyleGOT()) {
3533 // ELF / PIC requires the GOT pointer to be in the EBX register before function calls via PLT.
3536 RegsToPass.push_back(std::make_pair(
3537 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3538 getPointerTy(DAG.getDataLayout()))));
3540 // If we are tail calling and generating PIC/GOT style code, load the
3541 // address of the callee into ECX. The value in ecx is used as target of
3542 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3543 // for tail calls on PIC/GOT architectures. Normally we would just put the
3544 // address of GOT into ebx and then call target@PLT. But for tail calls
3545 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3548 // Note: The actual moving to ECX is done further down.
3549 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3550 if (G && !G->getGlobal()->hasLocalLinkage() &&
3551 G->getGlobal()->hasDefaultVisibility())
3552 Callee = LowerGlobalAddress(Callee, DAG);
3553 else if (isa<ExternalSymbolSDNode>(Callee))
3554 Callee = LowerExternalSymbol(Callee, DAG);
3558 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3559 // From AMD64 ABI document:
3560 // For calls that may call functions that use varargs or stdargs
3561 // (prototype-less calls or calls to functions containing ellipsis (...) in
3562 // the declaration) %al is used as a hidden argument to specify the number
3563 // of SSE registers used. The contents of %al do not need to match exactly
3564 // the number of registers, but must be an upper bound on the number of SSE
3565 // registers used, in the range 0 - 8 inclusive.
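// An illustrative caller-side view (hypothetical snippet, not from this file):
//   double a = 1.0, b = 2.0;
//   printf("%f %f\n", a, b);  // a, b passed in XMM0/XMM1
// Any AL value in [2, 8] would satisfy the ABI here; the lowering below sets
// it to the exact number of XMM registers used, i.e. 2.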
3567 // Count the number of XMM registers allocated.
3568 static const MCPhysReg XMMArgRegs[] = {
3569 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3570 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3572 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3573 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3574 && "SSE registers cannot be used when SSE is disabled");
3576 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3577 DAG.getConstant(NumXMMRegs, dl,
3581 if (isVarArg && IsMustTail) {
3582 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3583 for (const auto &F : Forwards) {
3584 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3585 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3589 // For tail calls, lower the arguments to the 'real' stack slots. Sibcalls
3590 // don't need this because the eligibility check rejects calls that require
3591 // shuffling arguments passed in memory.
3592 if (!IsSibcall && isTailCall) {
3593 // Force all the incoming stack arguments to be loaded from the stack
3594 // before any new outgoing arguments are stored to the stack, because the
3595 // outgoing stack slots may alias the incoming argument stack slots, and
3596 // the alias isn't otherwise explicit. This is slightly more conservative
3597 // than necessary, because it means that each store effectively depends
3598 // on every argument instead of just those arguments it would clobber.
3599 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3601 SmallVector<SDValue, 8> MemOpChains2;
3604 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3606 CCValAssign &VA = ArgLocs[I];
3608 if (VA.isRegLoc()) {
3609 if (VA.needsCustom()) {
3610 assert((CallConv == CallingConv::X86_RegCall) &&
3611 "Expecting custom case only in regcall calling convention");
3612 // This means that we are in a special case where one argument was
3613 // passed through two register locations, so skip the next location.
3620 assert(VA.isMemLoc());
3621 SDValue Arg = OutVals[OutsIndex];
3622 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3623 // Skip inalloca arguments. They don't require any work.
3624 if (Flags.isInAlloca())
3626 // Create frame index.
3627 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3628 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3629 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3630 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3632 if (Flags.isByVal()) {
3633 // Copy relative to framepointer.
3634 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3635 if (!StackPtr.getNode())
3636 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3637 getPointerTy(DAG.getDataLayout()));
3638 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3641 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3645 // Store relative to framepointer.
3646 MemOpChains2.push_back(DAG.getStore(
3647 ArgChain, dl, Arg, FIN,
3648 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3652 if (!MemOpChains2.empty())
3653 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3655 // Store the return address to the appropriate stack slot.
3656 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3657 getPointerTy(DAG.getDataLayout()),
3658 RegInfo->getSlotSize(), FPDiff, dl);
3661 // Build a sequence of copy-to-reg nodes chained together with token chain
3662 // and flag operands which copy the outgoing args into registers.
3664 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3665 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3666 RegsToPass[i].second, InFlag);
3667 InFlag = Chain.getValue(1);
3670 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3671 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3672 // In the 64-bit large code model, we have to make all calls
3673 // through a register, since the call instruction's 32-bit
3674 // pc-relative offset may not be large enough to hold the whole address.
3676 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3677 // If the callee is a GlobalAddress node (quite common, every direct call
3678 // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3680 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3682 // We should use an extra load for direct calls to dllimported functions in non-JIT mode.
3684 const GlobalValue *GV = G->getGlobal();
3685 if (!GV->hasDLLImportStorageClass()) {
3686 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3688 Callee = DAG.getTargetGlobalAddress(
3689 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3691 if (OpFlags == X86II::MO_GOTPCREL) {
3693 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3694 getPointerTy(DAG.getDataLayout()), Callee);
3695 // Add extra indirection
3696 Callee = DAG.getLoad(
3697 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3698 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3701 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3702 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3703 unsigned char OpFlags =
3704 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3706 Callee = DAG.getTargetExternalSymbol(
3707 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3708 } else if (Subtarget.isTarget64BitILP32() &&
3709 Callee->getValueType(0) == MVT::i32) {
3710 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
3711 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3714 // Returns a chain & a flag for retval copy to use.
3715 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3716 SmallVector<SDValue, 8> Ops;
3718 if (!IsSibcall && isTailCall) {
3719 Chain = DAG.getCALLSEQ_END(Chain,
3720 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3721 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3722 InFlag = Chain.getValue(1);
3725 Ops.push_back(Chain);
3726 Ops.push_back(Callee);
3729 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3731 // Add argument registers to the end of the list so that they are known live into the call.
3733 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3734 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3735 RegsToPass[i].second.getValueType()));
3737 // Add a register mask operand representing the call-preserved registers.
3738 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3739 // use the X86_INTR calling convention because it has the same CSR mask
3740 // (same preserved registers).
3741 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3742 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3743 assert(Mask && "Missing call preserved mask for calling convention");
3745 // If this is an invoke in a 32-bit function using a funclet-based
3746 // personality, assume the function clobbers all registers. If an exception
3747 // is thrown, the runtime will not restore CSRs.
3748 // FIXME: Model this more precisely so that we can register allocate across
3749 // the normal edge and spill and fill across the exceptional edge.
3750 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3751 const Function *CallerFn = MF.getFunction();
3752 EHPersonality Pers =
3753 CallerFn->hasPersonalityFn()
3754 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3755 : EHPersonality::Unknown;
3756 if (isFuncletEHPersonality(Pers))
3757 Mask = RegInfo->getNoPreservedMask();
3760 // Define a new register mask from the existing mask.
3761 uint32_t *RegMask = nullptr;
3763 // In some calling conventions we need to remove the used physical registers
3764 // from the reg mask.
3765 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3766 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3768 // Allocate a new Reg Mask and copy Mask.
3769 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3770 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3771 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3773 // Make sure all sub registers of the argument registers are reset in the RegMask.
3775 for (auto const &RegPair : RegsToPass)
3776 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3777 SubRegs.isValid(); ++SubRegs)
3778 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3780 // Create the RegMask Operand according to our updated mask.
3781 Ops.push_back(DAG.getRegisterMask(RegMask));
3783 // Create the RegMask Operand according to the static mask.
3784 Ops.push_back(DAG.getRegisterMask(Mask));
3787 if (InFlag.getNode())
3788 Ops.push_back(InFlag);
3792 //// If this is the first return lowered for this function, add the regs
3793 //// to the liveout set for the function.
3794 // This isn't right, although it's probably harmless on x86; liveouts
3795 // should be computed from returns not tail calls. Consider a void
3796 // function making a tail call to a function returning int.
3797 MF.getFrameInfo().setHasTailCall();
3798 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3801 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3802 InFlag = Chain.getValue(1);
3804 // Create the CALLSEQ_END node.
3805 unsigned NumBytesForCalleeToPop;
3806 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3807 DAG.getTarget().Options.GuaranteedTailCallOpt))
3808 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3809 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3810 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3811 SR == StackStructReturn)
3812 // If this is a call to a struct-return function, the callee
3813 // pops the hidden struct pointer, so we have to push it back.
3814 // This is common for Darwin/X86, Linux & Mingw32 targets.
3815 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3816 NumBytesForCalleeToPop = 4;
3818 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3820 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3821 // No need to reset the stack after the call if the call doesn't return. To
3822 // make the MachineInstr verifier happy, we'll pretend the callee does it for us.
3823 NumBytesForCalleeToPop = NumBytes;
3826 // Returns a flag for retval copy to use.
3828 Chain = DAG.getCALLSEQ_END(Chain,
3829 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3830 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3833 InFlag = Chain.getValue(1);
3837 // Handle result values, copying them out of physregs into vregs that we return.
3838 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3842 //===----------------------------------------------------------------------===//
3843 // Fast Calling Convention (tail call) implementation
3844 //===----------------------------------------------------------------------===//
3846 // Like stdcall, the callee cleans up the arguments, except that ECX is
3847 // reserved for storing the address of the tail-called function. Only 2
3848 // registers are free for argument passing (inreg). Tail call optimization
3849 // is performed provided:
3850 // * tailcallopt is enabled
3851 // * caller/callee are fastcc
3852 // On the X86_64 architecture, with GOT-style position independent code, only
3853 // local (within module) calls are supported at the moment.
3854 // To keep the stack aligned according to the platform ABI, the function
3855 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3856 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3857 // If a tail-called callee has more arguments than the caller, the caller
3858 // needs to make sure that there is room to move the RETADDR to. This is
3859 // achieved by reserving an area the size of the argument delta right after the
3860 // original RETADDR, but before the saved frame pointer or the spilled registers,
3861 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
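// A sketch of the resulting stack layout for that example (illustrative; the
// exact set of saved registers depends on the function):
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR
//     move area ]
//   (possible saved frame pointer)
//   callee-saved registers / spill slots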
3873 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment requirement.
3876 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3877 SelectionDAG& DAG) const {
3878 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3879 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3880 unsigned StackAlignment = TFI.getStackAlignment();
3881 uint64_t AlignMask = StackAlignment - 1;
3882 int64_t Offset = StackSize;
3883 unsigned SlotSize = RegInfo->getSlotSize();
3884 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3885 // The remainder is at most StackAlignment - SlotSize, so just add the difference.
3886 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3888 // Mask out the lower bits, then add the stack alignment once plus StackAlignment - SlotSize bytes.
3889 Offset = ((~AlignMask) & Offset) + StackAlignment +
3890 (StackAlignment-SlotSize);
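// Worked example (illustrative): with StackAlignment == 16 and SlotSize == 4,
// StackSize == 20 has (20 & 15) == 4 <= 12, so Offset becomes 20 + (12 - 4)
// == 28 == 16 + 12; StackSize == 30 has (30 & 15) == 14 > 12, so Offset
// becomes (30 & ~15) + 16 + 12 == 16 + 28 == 44 == 2 * 16 + 12.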
3895 /// Return true if the given stack call argument is already available in the
3896 /// same (relative) position of the caller's incoming argument stack.
3898 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3899 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3900 const X86InstrInfo *TII, const CCValAssign &VA) {
3901 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3904 // Look through nodes that don't alter the bits of the incoming value.
3905 unsigned Op = Arg.getOpcode();
3906 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3907 Arg = Arg.getOperand(0);
3910 if (Op == ISD::TRUNCATE) {
3911 const SDValue &TruncInput = Arg.getOperand(0);
3912 if (TruncInput.getOpcode() == ISD::AssertZext &&
3913 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3914 Arg.getValueType()) {
3915 Arg = TruncInput.getOperand(0);
3923 if (Arg.getOpcode() == ISD::CopyFromReg) {
3924 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3925 if (!TargetRegisterInfo::isVirtualRegister(VR))
3927 MachineInstr *Def = MRI->getVRegDef(VR);
3930 if (!Flags.isByVal()) {
3931 if (!TII->isLoadFromStackSlot(*Def, FI))
3934 unsigned Opcode = Def->getOpcode();
3935 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3936 Opcode == X86::LEA64_32r) &&
3937 Def->getOperand(1).isFI()) {
3938 FI = Def->getOperand(1).getIndex();
3939 Bytes = Flags.getByValSize();
3943 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3944 if (Flags.isByVal())
3945 // ByVal argument is passed in as a pointer but it's now being
3946 // dereferenced. e.g.
3947 // define @foo(%struct.X* %A) {
3948 // tail call @bar(%struct.X* byval %A)
3951 SDValue Ptr = Ld->getBasePtr();
3952 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3955 FI = FINode->getIndex();
3956 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3957 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3958 FI = FINode->getIndex();
3959 Bytes = Flags.getByValSize();
3963 assert(FI != INT_MAX);
3964 if (!MFI.isFixedObjectIndex(FI))
3967 if (Offset != MFI.getObjectOffset(FI))
3970 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3971 // If the argument location is wider than the argument type, check that any
3972 // extension flags match.
3973 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3974 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3979 return Bytes == MFI.getObjectSize(FI);
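// Illustrative case (assumed, not from the source): if the caller received an
// i32 at fixed stack offset 8 and forwards that same value as an outgoing
// argument at LocMemOffset 8 with matching size and extension flags, this
// returns true and the tail call can reuse the slot without a store.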
3982 /// Check whether the call is eligible for tail call optimization. Targets
3983 /// that want to do tail call optimization should implement this function.
3984 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3985 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3986 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3987 const SmallVectorImpl<ISD::OutputArg> &Outs,
3988 const SmallVectorImpl<SDValue> &OutVals,
3989 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3990 if (!mayTailCallThisCC(CalleeCC))
3993 // If -tailcallopt is specified, make fastcc functions tail-callable.
3994 MachineFunction &MF = DAG.getMachineFunction();
3995 const Function *CallerF = MF.getFunction();
3997 // If the function return type is x86_fp80 and the callee return type is not,
3998 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3999 // perform a tailcall optimization here.
4000 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4003 CallingConv::ID CallerCC = CallerF->getCallingConv();
4004 bool CCMatch = CallerCC == CalleeCC;
4005 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4006 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4008 // Win64 functions have extra shadow space for argument homing. Don't do the
4009 // sibcall if the caller and callee have mismatched expectations for this space.
4011 if (IsCalleeWin64 != IsCallerWin64)
4014 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4015 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4020 // Look for obvious safe cases to perform tail call optimization that do not
4021 // require ABI changes. This is what gcc calls sibcall.
4023 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4024 // emit a special epilogue.
4025 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4026 if (RegInfo->needsStackRealignment(MF))
4029 // Also avoid sibcall optimization if either caller or callee uses struct
4030 // return semantics.
4031 if (isCalleeStructRet || isCallerStructRet)
4034 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4036 LLVMContext &C = *DAG.getContext();
4037 if (isVarArg && !Outs.empty()) {
4038 // Optimizing for varargs on Win64 is unlikely to be safe without
4039 // additional testing.
4040 if (IsCalleeWin64 || IsCallerWin64)
4043 SmallVector<CCValAssign, 16> ArgLocs;
4044 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4046 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4047 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4048 if (!ArgLocs[i].isRegLoc())
4052 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4053 // stack. Therefore, if it's not used by the call it is not safe to optimize
4054 // this into a sibcall.
4055 bool Unused = false;
4056 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4063 SmallVector<CCValAssign, 16> RVLocs;
4064 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4065 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4066 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4067 CCValAssign &VA = RVLocs[i];
4068 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4073 // Check that the call results are passed in the same way.
4074 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4075 RetCC_X86, RetCC_X86))
4077 // The callee has to preserve all registers the caller needs to preserve.
4078 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4079 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4081 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4082 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4086 unsigned StackArgsSize = 0;
4088 // If the callee takes no arguments then go on to check the results of the call.
4090 if (!Outs.empty()) {
4091 // Check if stack adjustment is needed. For now, do not do this if any
4092 // argument is passed on the stack.
4093 SmallVector<CCValAssign, 16> ArgLocs;
4094 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4096 // Allocate shadow area for Win64
4098 CCInfo.AllocateStack(32, 8);
4100 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4101 StackArgsSize = CCInfo.getNextStackOffset();
4103 if (CCInfo.getNextStackOffset()) {
4104 // Check whether the arguments are already laid out in the same way as
4105 // the caller's fixed stack objects.
4106 MachineFrameInfo &MFI = MF.getFrameInfo();
4107 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4108 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4109 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4110 CCValAssign &VA = ArgLocs[i];
4111 SDValue Arg = OutVals[i];
4112 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4113 if (VA.getLocInfo() == CCValAssign::Indirect)
4115 if (!VA.isRegLoc()) {
4116 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4123 bool PositionIndependent = isPositionIndependent();
4124 // If the tailcall address may be in a register, then make sure it's
4125 // possible to register allocate for it. In 32-bit, the call address can
4126 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4127 // callee-saved registers are restored. These happen to be the same
4128 // registers used to pass 'inreg' arguments so watch out for those.
4129 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4130 !isa<ExternalSymbolSDNode>(Callee)) ||
4131 PositionIndependent)) {
4132 unsigned NumInRegs = 0;
4133 // In PIC we need an extra register to formulate the address computation for the callee.
4135 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4137 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4138 CCValAssign &VA = ArgLocs[i];
4141 unsigned Reg = VA.getLocReg();
4144 case X86::EAX: case X86::EDX: case X86::ECX:
4145 if (++NumInRegs == MaxInRegs)
4152 const MachineRegisterInfo &MRI = MF.getRegInfo();
4153 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4157 bool CalleeWillPop =
4158 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4159 MF.getTarget().Options.GuaranteedTailCallOpt);
4161 if (unsigned BytesToPop =
4162 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4163 // If we have bytes to pop, the callee must pop them.
4164 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4165 if (!CalleePopMatches)
4167 } else if (CalleeWillPop && StackArgsSize > 0) {
4168 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4176 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4177 const TargetLibraryInfo *libInfo) const {
4178 return X86::createFastISel(funcInfo, libInfo);
4181 //===----------------------------------------------------------------------===//
4182 // Other Lowering Hooks
4183 //===----------------------------------------------------------------------===//
4185 static bool MayFoldLoad(SDValue Op) {
4186 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4189 static bool MayFoldIntoStore(SDValue Op) {
4190 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4193 static bool MayFoldIntoZeroExtend(SDValue Op) {
4194 if (Op.hasOneUse()) {
4195 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4196 return (ISD::ZERO_EXTEND == Opcode);
4201 static bool isTargetShuffle(unsigned Opcode) {
4203 default: return false;
4204 case X86ISD::BLENDI:
4205 case X86ISD::PSHUFB:
4206 case X86ISD::PSHUFD:
4207 case X86ISD::PSHUFHW:
4208 case X86ISD::PSHUFLW:
4210 case X86ISD::INSERTPS:
4211 case X86ISD::PALIGNR:
4212 case X86ISD::VSHLDQ:
4213 case X86ISD::VSRLDQ:
4214 case X86ISD::MOVLHPS:
4215 case X86ISD::MOVLHPD:
4216 case X86ISD::MOVHLPS:
4217 case X86ISD::MOVLPS:
4218 case X86ISD::MOVLPD:
4219 case X86ISD::MOVSHDUP:
4220 case X86ISD::MOVSLDUP:
4221 case X86ISD::MOVDDUP:
4224 case X86ISD::UNPCKL:
4225 case X86ISD::UNPCKH:
4226 case X86ISD::VBROADCAST:
4227 case X86ISD::VPERMILPI:
4228 case X86ISD::VPERMILPV:
4229 case X86ISD::VPERM2X128:
4230 case X86ISD::VPERMIL2:
4231 case X86ISD::VPERMI:
4232 case X86ISD::VPPERM:
4233 case X86ISD::VPERMV:
4234 case X86ISD::VPERMV3:
4235 case X86ISD::VPERMIV3:
4236 case X86ISD::VZEXT_MOVL:
4241 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4243 default: return false;
4245 case X86ISD::PSHUFB:
4246 case X86ISD::VPERMILPV:
4247 case X86ISD::VPERMIL2:
4248 case X86ISD::VPPERM:
4249 case X86ISD::VPERMV:
4250 case X86ISD::VPERMV3:
4251 case X86ISD::VPERMIV3:
4253 // 'Faux' Target Shuffles.
4260 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4261 MachineFunction &MF = DAG.getMachineFunction();
4262 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4263 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4264 int ReturnAddrIndex = FuncInfo->getRAIndex();
4266 if (ReturnAddrIndex == 0) {
4267 // Set up a frame object for the return address.
4268 unsigned SlotSize = RegInfo->getSlotSize();
4269 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4272 FuncInfo->setRAIndex(ReturnAddrIndex);
4275 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4278 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4279 bool hasSymbolicDisplacement) {
4280 // Offset should fit into 32 bit immediate field.
4281 if (!isInt<32>(Offset))
4284 // If we don't have a symbolic displacement, we don't have any extra restrictions.
4286 if (!hasSymbolicDisplacement)
4289 // FIXME: Some tweaks might be needed for medium code model.
4290 if (M != CodeModel::Small && M != CodeModel::Kernel)
4293 // For the small code model, we assume that the latest object is 16MB before
4294 // the end of the 31-bit boundary. We may also accept pretty large negative
4295 // constants, knowing that all objects are in the positive half of the address space.
4296 if (M == CodeModel::Small && Offset < 16*1024*1024)
4299 // For the kernel code model, we know that all objects reside in the negative
4300 // half of the 32-bit address space. We may not accept negative offsets, since
4301 // they may be just off, and we may accept pretty large positive ones.
4302 if (M == CodeModel::Kernel && Offset >= 0)
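// A few illustrative probes (assuming a symbolic displacement is present):
// Offset == 4096 under CodeModel::Small is accepted (well below the 16MB
// margin), Offset == 16 * 1024 * 1024 under CodeModel::Small is rejected, and
// any non-negative Offset under CodeModel::Kernel is accepted.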
4308 /// Determines whether the callee is required to pop its own arguments.
4309 /// Callee pop is necessary to support tail calls.
4310 bool X86::isCalleePop(CallingConv::ID CallingConv,
4311 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4312 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4313 // can guarantee TCO.
4314 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4317 switch (CallingConv) {
4320 case CallingConv::X86_StdCall:
4321 case CallingConv::X86_FastCall:
4322 case CallingConv::X86_ThisCall:
4323 case CallingConv::X86_VectorCall:
4328 /// \brief Return true if the condition is an unsigned comparison operation.
4329 static bool isX86CCUnsigned(unsigned X86CC) {
4332 llvm_unreachable("Invalid integer condition!");
4348 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4349 switch (SetCCOpcode) {
4350 default: llvm_unreachable("Invalid integer condition!");
4351 case ISD::SETEQ: return X86::COND_E;
4352 case ISD::SETGT: return X86::COND_G;
4353 case ISD::SETGE: return X86::COND_GE;
4354 case ISD::SETLT: return X86::COND_L;
4355 case ISD::SETLE: return X86::COND_LE;
4356 case ISD::SETNE: return X86::COND_NE;
4357 case ISD::SETULT: return X86::COND_B;
4358 case ISD::SETUGT: return X86::COND_A;
4359 case ISD::SETULE: return X86::COND_BE;
4360 case ISD::SETUGE: return X86::COND_AE;
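// For instance, an unsigned less-than (ISD::SETULT) maps to X86::COND_B,
// which materializes as jb/setb/cmovb depending on the consumer.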
4364 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4365 /// condition code, returning the condition code and the LHS/RHS of the
4366 /// comparison to make.
4367 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4368 bool isFP, SDValue &LHS, SDValue &RHS,
4369 SelectionDAG &DAG) {
4371 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4372 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4373 // X > -1 -> X == 0, jump !sign.
4374 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4375 return X86::COND_NS;
4377 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4378 // X < 0 -> X == 0, jump on sign.
4381 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0
4383 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4384 return X86::COND_LE;
4388 return TranslateIntegerX86CC(SetCCOpcode);
4391 // First determine if it is required or is profitable to flip the operands.
4393 // If LHS is a foldable load, but RHS is not, flip the condition.
4394 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4395 !ISD::isNON_EXTLoad(RHS.getNode())) {
4396 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4397 std::swap(LHS, RHS);
4400 switch (SetCCOpcode) {
4406 std::swap(LHS, RHS);
4410 // On a floating point condition, the flags are set as follows:
4412 // 0 | 0 | 0 | X > Y
4413 // 0 | 0 | 1 | X < Y
4414 // 1 | 0 | 0 | X == Y
4415 // 1 | 1 | 1 | unordered
4416 switch (SetCCOpcode) {
4417 default: llvm_unreachable("Condcode should be pre-legalized away");
4419 case ISD::SETEQ: return X86::COND_E;
4420 case ISD::SETOLT: // flipped
4422 case ISD::SETGT: return X86::COND_A;
4423 case ISD::SETOLE: // flipped
4425 case ISD::SETGE: return X86::COND_AE;
4426 case ISD::SETUGT: // flipped
4428 case ISD::SETLT: return X86::COND_B;
4429 case ISD::SETUGE: // flipped
4431 case ISD::SETLE: return X86::COND_BE;
4433 case ISD::SETNE: return X86::COND_NE;
4434 case ISD::SETUO: return X86::COND_P;
4435 case ISD::SETO: return X86::COND_NP;
4437 case ISD::SETUNE: return X86::COND_INVALID;
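// Example (illustrative): (setolt %x, %y) is handled by swapping the
// operands and using COND_A (CF == 0 && ZF == 0); with UCOMISS an unordered
// result sets ZF = PF = CF = 1, so COND_A correctly fails for NaN inputs.
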
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {

  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.readMem = false;
  Info.writeMem = false;
  Info.vol = false;
  Info.offset = 0;

  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.readMem = true;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  default:
    return false;
  }

  return true;
}

/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations must target a movq or addq instruction: don't let the load
  // shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

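// E.g. extracting a v4i32 subvector from a v8i32 source is considered cheap
// at indices 0 and 4 (the low/high halves); a misaligned index such as 2 is
// not.
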
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}

MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

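// For example (illustrative): a 16-byte memcmp-style equality test can be
// lowered as a v16i8 compare followed by PMOVMSKB of the result, avoiding a
// scalar byte loop.
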
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}

/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}

/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}

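// Worked example (illustrative): the mask <0, 1, SM_SentinelUndef, 7> widens
// to <0, 3> (pair (0,1) is an aligned run, and undef pairs with the odd
// index 7), while <1, 2, 4, 5> fails because the pair (1,2) straddles a
// pair boundary.
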
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  ScaledMask.assign(NumElts * Scale, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}

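// Worked example (illustrative): scaling <1, SM_SentinelZero> by 2 yields
// <2, 3, SM_SentinelZero, SM_SentinelZero>.
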
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(1);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}

/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(2);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}

bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}

static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");

  uint64_t Index = N->getConstantOperandVal(1);
  MVT VecVT = N->getOperand(0).getSimpleValueType();
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");

  uint64_t Index = N->getConstantOperandVal(2);
  MVT VecVT = N->getSimpleValueType(0);
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}

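// E.g. extracting the v4i32 subvector at element index 4 of a v8i32 source
// with a 128-bit chunk width uses immediate 4 / (128 / 32) = 1, i.e. the
// upper 128-bit half.
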
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}

// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}

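// E.g. (illustrative) building <2 x i64> <1, 2> on a 32-bit target emits a
// <4 x i32> <1, 0, 2, 0> constant bitcast to v2i64 (low word first on
// little-endian x86).
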
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}

static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR node.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}

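// Note the flooring behavior (illustrative): with 128-bit chunks of a v8i32
// source, ElemsPerChunk is 4, so IdxVal = 3 is rounded down to 0 and the low
// four elements are extracted.
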
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF just returns Result.
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  // There are 3 possible cases:
  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
  // 2. Subvector should be inserted in the upper part
  //    (IdxVal + SubVecNumElems == NumElems)
  // 3. Subvector should be inserted in the middle (for example v2i1
  //    to v16i1, index 2)

  // extend to natively supported kshift
  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  SDValue Undef = DAG.getUNDEF(WideOpVT);
  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                                   Undef, SubVec, ZeroIdx);

  // Extract the sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };

  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
    return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             SubVec, ZeroIdx);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    // Zero upper bits of the Vec
    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Subvector should be inserted in the middle - use shuffle
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, Idx);
  SmallVector<int, 64> Mask;
  for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i : i + NumElems);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}

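// Illustrative walk-through of the IdxVal == 0 path: inserting v8i1 into the
// low bits of v16i1 zero-extends the subvector into the wide type, clears
// the destination's low 8 bits with a KSHIFTR/KSHIFTL pair, and ORs the two
// results together.
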
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}

static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}

static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector())
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}

/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();

  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}

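// E.g. (illustrative) for v4i32 the non-unary lo mask is <0, 4, 1, 5> and
// the hi mask is <2, 6, 3, 7>, matching UNPCKLPS/UNPCKHPS.
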
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Return a vector_shuffle of the specified vector and a zero or undef
/// vector. This produces a shuffle where the low element of V2 is swizzled
/// into the zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}

static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
         V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  SDValue Ptr = Load->getBasePtr();
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
      Ptr->getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}

// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

  // Extract all the undef/constant element data and pack into single bitsets.
  APInt UndefBits(SizeInBits, 0);
  APInt MaskBits(SizeInBits, 0);

  // Split the undef/constant single bitset data into the target elements.
  auto SplitBitData = [&]() {
    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefBits.getBoolValue() && !AllowUndefs)
      return false;

    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };

  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned BitOffset) {
    if (!Cst)
      return false;
    if (isa<UndefValue>(Cst)) {
      unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
      Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask.insertBits(CInt->getValue(), BitOffset);
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
      return true;
    }
    return false;
  };

  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (Src.isUndef()) {
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
      MaskBits.insertBits(Bits, BitOffset);
    }
    return SplitBitData();
  }

  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

    unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
                               i * CstEltSizeInBits))
        return false;

    return SplitBitData();
  }

  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= SrcEltSizeInBits) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      APInt Bits(SizeInBits, 0);
      APInt Undefs(SizeInBits, 0);
      if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
        for (unsigned i = 0; i != NumSrcElts; ++i) {
          MaskBits |= Bits.shl(i * SrcEltSizeInBits);
          UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
        }
        return SplitBitData();
      }
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    MaskBits = MaskBits.zext(SizeInBits);
    return SplitBitData();
  }

  return false;
}

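// Worked example (sketch): a v4i32 build vector <0, 1, undef, 3> queried
// with EltSizeInBits == 64 yields two i64 elements, 0x0000000100000000 and
// 0x0000000300000000, with the partially-undef high element kept only when
// AllowPartialUndefs is true (its undef bits are then treated as zero).
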
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

  // Extract the raw target constant bits.
  // FIXME: We currently don't support UNDEF bits or mask entries.
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}

/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it onto the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  case X86ISD::VPERMIL2: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      SmallVector<uint64_t, 32> RawMask;
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    SmallVector<uint64_t, 32> RawMask;
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMIV3: {
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(0);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}

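// E.g. (illustrative) a PSHUFD with immediate 0x1B decodes to the mask
// <3, 2, 1, 0>, since each 2-bit field of the immediate selects one source
// element.
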
/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        Mask[i] = SM_SentinelUndef;
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        Mask[i] = SM_SentinelUndef;
      else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
    }
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops) {
  if (!N.getValueType().isVector())
    return false;

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
         "Expected byte aligned value types");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a similar vector.
    SDValue N0 = N.getOperand(0);
    if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N0.getOperand(0).getValueType() != VT ||
        !isa<ConstantSDNode>(N0.getOperand(1)) ||
        NumElts <= N0.getConstantOperandVal(1) ||
        !N->isOnlyUserOf(N0.getNode()))
      return false;
    Ops.push_back(N0.getOperand(0));
    Mask.push_back(N0.getConstantOperandVal(1));
    Mask.append(NumElts - 1, SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    uint64_t InIdx = N.getConstantOperandVal(2);
    assert(InIdx < NumElts && "Illegal insertion index");

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ISD::AssertZext ||
        InScl.getOperand(0).getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0).getOperand(0);
    uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
    assert(ExIdx < NumElts && "Illegal extraction index");
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    unsigned NumBytes = NumSizeInBits / 8;
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VZEXT: {
    // TODO - add support for VPMOVZX with smaller input vector types.
    SDValue Src = N.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}

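// E.g. (illustrative) a v2i64 VSRLI by 32 bits decodes to the byte-level
// mask <4,5,6,7,Z,Z,Z,Z,12,13,14,15,Z,Z,Z,Z>, where Z is SM_SentinelZero.
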
/// Removes unused shuffle source inputs and adjusts the shuffle mask
/// accordingly.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedInputs.push_back(Inputs[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  Inputs = UsedInputs;
}

/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask) {
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
    if (!getFauxShuffleMask(Op, Mask, Inputs))
      return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}

/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue(); // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

6050 /// Custom lower build_vector of v16i8.
6051 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6052 unsigned NumNonZero, unsigned NumZero,
6054 const X86Subtarget &Subtarget) {
6055 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6062 // SSE4.1 - use PINSRB to insert each byte directly.
6063 if (Subtarget.hasSSE41()) {
6064 for (unsigned i = 0; i < 16; ++i) {
6065 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6067 // If the build vector contains zeros or our first insertion is not the
6068 // first index then insert into zero vector to break any register
6069 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6072 if (NumZero || 0 != i)
6073 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6075 assert(0 == i && "Expected insertion into zero-index");
6076 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6077 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6078 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6079 V = DAG.getBitcast(MVT::v16i8, V);
6083 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6084 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6091 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6092 for (unsigned i = 0; i < 16; ++i) {
6093 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6094 if (ThisIsNonZero && First) {
6096 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6098 V = DAG.getUNDEF(MVT::v8i16);
6103 // FIXME: Investigate extending to i32 instead of just i16.
6104 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6105 SDValue ThisElt, LastElt;
6106 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6107 if (LastIsNonZero) {
6109 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6111 if (ThisIsNonZero) {
6112 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6113 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6114 DAG.getConstant(8, dl, MVT::i8));
6116 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6122 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6123 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6124 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6125 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6126 V = DAG.getBitcast(MVT::v8i16, V);
6127 } else {
6128 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6129 DAG.getIntPtrConstant(i / 2, dl));
6130 }
6131 }
6132 }
6133 }
6135 return DAG.getBitcast(MVT::v16i8, V);
6136 }
6138 /// Custom lower build_vector of v8i16.
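/// Same idea as the v16i8 case above but at i16 granularity, e.g.
///   (build_vector x, 0, y, 0, 0, 0, 0, 0)
/// becomes PINSRW insertions of the non-zero elements into a zero vector.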
6139 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6140 unsigned NumNonZero, unsigned NumZero,
6141 SelectionDAG &DAG,
6142 const X86Subtarget &Subtarget) {
6143 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6144 return SDValue();
6146 SDLoc dl(Op);
6147 SDValue V;
6148 bool First = true;
6149 for (unsigned i = 0; i < 8; ++i) {
6150 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6151 if (IsNonZero) {
6152 // If the build vector contains zeros, or our first insertion is not the
6153 // first index, then insert into a zero vector to break any register
6154 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6155 if (First) {
6156 First = false;
6157 if (NumZero || 0 != i)
6158 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6159 else {
6160 assert(0 == i && "Expected insertion into zero-index");
6161 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6162 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6163 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6164 V = DAG.getBitcast(MVT::v8i16, V);
6165 continue;
6166 }
6167 }
6168 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6169 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6170 }
6171 }
6173 return V;
6174 }
6176 /// Custom lower build_vector of v4i32 or v4f32.
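/// E.g. (build_vector (extract_elt %v, 0), (extract_elt %v, 1), 0, 0) can
/// become a single shuffle of %v with a zero vector, and mixed sources can
/// become one INSERTPS when SSE4.1 is available.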
6177 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6178 const X86Subtarget &Subtarget) {
6179 // Find all zeroable elements.
6180 std::bitset<4> Zeroable;
6181 for (int i = 0; i < 4; ++i) {
6182 SDValue Elt = Op->getOperand(i);
6183 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6184 }
6185 assert(Zeroable.size() - Zeroable.count() > 1 &&
6186 "We expect at least two non-zero elements!");
6188 // We only know how to deal with build_vector nodes where elements are either
6189 // zeroable or extract_vector_elt with constant index.
6190 SDValue FirstNonZero;
6191 unsigned FirstNonZeroIdx;
6192 for (unsigned i = 0; i < 4; ++i) {
6193 if (Zeroable[i])
6194 continue;
6195 SDValue Elt = Op->getOperand(i);
6196 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6197 !isa<ConstantSDNode>(Elt.getOperand(1)))
6198 return SDValue();
6199 // Make sure that this node is extracting from a 128-bit vector.
6200 MVT VT = Elt.getOperand(0).getSimpleValueType();
6201 if (!VT.is128BitVector())
6202 return SDValue();
6203 if (!FirstNonZero.getNode()) {
6204 FirstNonZero = Elt;
6205 FirstNonZeroIdx = i;
6206 }
6207 }
6209 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6210 SDValue V1 = FirstNonZero.getOperand(0);
6211 MVT VT = V1.getSimpleValueType();
6213 // See if this build_vector can be lowered as a blend with zero.
6214 SDValue Elt;
6215 unsigned EltMaskIdx, EltIdx;
6216 int Mask[4];
6217 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6218 if (Zeroable[EltIdx]) {
6219 // The zero vector will be on the right hand side.
6220 Mask[EltIdx] = EltIdx + 4;
6221 continue;
6222 }
6224 Elt = Op->getOperand(EltIdx);
6225 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6226 EltMaskIdx = Elt.getConstantOperandVal(1);
6227 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6228 break;
6229 Mask[EltIdx] = EltIdx;
6230 }
6232 if (EltIdx == 4) {
6233 // Let the shuffle legalizer deal with blend operations.
6234 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6235 if (V1.getSimpleValueType() != VT)
6236 V1 = DAG.getBitcast(VT, V1);
6237 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6238 }
6240 // See if we can lower this build_vector to an INSERTPS.
6241 if (!Subtarget.hasSSE41())
6242 return SDValue();
6244 SDValue V2 = Elt.getOperand(0);
6245 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6246 V1 = SDValue();
6248 bool CanFold = true;
6249 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6250 if (Zeroable[i])
6251 continue;
6253 SDValue Current = Op->getOperand(i);
6254 SDValue SrcVector = Current->getOperand(0);
6255 if (!V1.getNode())
6256 V1 = SrcVector;
6257 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6258 }
6260 if (!CanFold)
6261 return SDValue();
6263 assert(V1.getNode() && "Expected at least two non-zero elements!");
6264 if (V1.getSimpleValueType() != MVT::v4f32)
6265 V1 = DAG.getBitcast(MVT::v4f32, V1);
6266 if (V2.getSimpleValueType() != MVT::v4f32)
6267 V2 = DAG.getBitcast(MVT::v4f32, V2);
6269 // Ok, we can emit an INSERTPS instruction.
6270 unsigned ZMask = Zeroable.to_ulong();
6272 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6273 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6275 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6276 DAG.getIntPtrConstant(InsertPSMask, DL));
6277 return DAG.getBitcast(VT, Result);
6280 /// Return a vector logical shift node.
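/// The shift is performed as a whole-vector byte shift, e.g. a 64-bit left
/// shift of a v2i64 becomes (bitcast (VSHLDQ (bitcast v16i8), 8)), which is
/// why NumBits below must be a multiple of 8.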
6281 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6282 SelectionDAG &DAG, const TargetLowering &TLI,
6283 const SDLoc &dl) {
6284 assert(VT.is128BitVector() && "Unknown type for VShift");
6285 MVT ShVT = MVT::v16i8;
6286 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6287 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6288 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6289 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6290 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6291 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6292 }
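/// Widen a scalar load into a whole-vector load from a suitably aligned base
/// and splat the desired element instead, e.g. turn
///   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
/// into
///   shuffle (vload ptr), undef, <1, 1, 1, 1>
/// when the containing stack slot is (or can be made) sufficiently aligned.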
6294 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6295 SelectionDAG &DAG) {
6297 // Check if the scalar load can be widened into a vector load. And if
6298 // the address is "base + cst" see if the cst can be "absorbed" into
6299 // the shuffle mask.
6300 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6301 SDValue Ptr = LD->getBasePtr();
6302 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6303 return SDValue();
6304 EVT PVT = LD->getValueType(0);
6305 if (PVT != MVT::i32 && PVT != MVT::f32)
6306 return SDValue();
6308 int FI = -1;
6309 int64_t Offset = 0;
6310 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6311 FI = FINode->getIndex();
6312 Offset = 0;
6313 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6314 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6315 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6316 Offset = Ptr.getConstantOperandVal(1);
6317 Ptr = Ptr.getOperand(0);
6318 } else {
6319 return SDValue();
6320 }
6322 // FIXME: 256-bit vector instructions don't require a strict alignment,
6323 // improve this code to support it better.
6324 unsigned RequiredAlign = VT.getSizeInBits()/8;
6325 SDValue Chain = LD->getChain();
6326 // Make sure the stack object alignment is at least 16 or 32.
6327 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6328 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6329 if (MFI.isFixedObjectIndex(FI)) {
6330 // Can't change the alignment. FIXME: It's possible to compute
6331 // the exact stack offset and reference FI + adjust offset instead.
6332 // If someone *really* cares about this, that's the way to implement it.
6333 return SDValue();
6334 }
6335 MFI.setObjectAlignment(FI, RequiredAlign);
6336 }
6339 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6340 // Ptr + (Offset & ~15) or Ptr + (Offset & ~31).
6343 if ((Offset % RequiredAlign) & 3)
6344 return SDValue();
6345 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6346 if (StartOffset) {
6347 SDLoc DL(Ptr);
6348 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6349 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6350 }
6352 int EltNo = (Offset - StartOffset) >> 2;
6353 unsigned NumElems = VT.getVectorNumElements();
6355 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6356 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6357 LD->getPointerInfo().getWithOffset(StartOffset));
6359 SmallVector<int, 8> Mask(NumElems, EltNo);
6361 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6362 }
6364 return SDValue();
6365 }
6367 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6368 /// elements can be replaced by a single large load which has the same value as
6369 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6371 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6372 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6373 const SDLoc &DL, SelectionDAG &DAG,
6374 bool isAfterLegalize) {
6375 unsigned NumElems = Elts.size();
6377 int LastLoadedElt = -1;
6378 SmallBitVector LoadMask(NumElems, false);
6379 SmallBitVector ZeroMask(NumElems, false);
6380 SmallBitVector UndefMask(NumElems, false);
6382 // For each element in the initializer, see if we've found a load, zero or an
6383 // undef.
6384 for (unsigned i = 0; i < NumElems; ++i) {
6385 SDValue Elt = peekThroughBitcasts(Elts[i]);
6386 if (!Elt.getNode())
6387 return SDValue();
6389 if (Elt.isUndef())
6390 UndefMask[i] = true;
6391 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6392 ZeroMask[i] = true;
6393 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6394 LoadMask[i] = true;
6395 LastLoadedElt = i;
6396 // Each loaded element must be the correct fractional portion of the
6397 // requested vector load.
6398 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6399 return SDValue();
6400 } else
6401 return SDValue();
6402 }
6403 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6404 "Incomplete element masks");
6406 // Handle Special Cases - all undef or undef/zero.
6407 if (UndefMask.count() == NumElems)
6408 return DAG.getUNDEF(VT);
6410 // FIXME: Should we return this as a BUILD_VECTOR instead?
6411 if ((ZeroMask | UndefMask).count() == NumElems)
6412 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6413 : DAG.getConstantFP(0.0, DL, VT);
6415 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6416 int FirstLoadedElt = LoadMask.find_first();
6417 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6418 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6419 EVT LDBaseVT = EltBase.getValueType();
6421 // Consecutive loads can contain UNDEFs but not ZERO elements.
6422 // Consecutive loads with UNDEF and ZERO elements require an
6423 // additional shuffle stage to clear the ZERO elements.
6424 bool IsConsecutiveLoad = true;
6425 bool IsConsecutiveLoadWithZeros = true;
6426 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6427 if (LoadMask[i]) {
6428 SDValue Elt = peekThroughBitcasts(Elts[i]);
6429 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6430 if (!DAG.areNonVolatileConsecutiveLoads(
6431 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6432 i - FirstLoadedElt)) {
6433 IsConsecutiveLoad = false;
6434 IsConsecutiveLoadWithZeros = false;
6435 break;
6436 }
6437 } else if (ZeroMask[i]) {
6438 IsConsecutiveLoad = false;
6439 }
6440 }
6442 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6443 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6444 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6445 "Cannot merge volatile loads.");
6447 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6448 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6450 if (LDBase->hasAnyUseOfValue(1)) {
6451 SDValue NewChain =
6452 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6453 SDValue(NewLd.getNode(), 1));
6454 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6455 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6456 SDValue(NewLd.getNode(), 1));
6457 }
6458 return NewLd;
6459 };
6462 // LOAD - all consecutive load/undefs (must start/end with a load).
6463 // If we have found an entire vector of loads and undefs, then return a large
6464 // load of the entire vector width starting at the base pointer.
6465 // If the vector contains zeros, then attempt to shuffle those elements.
6466 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6467 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6468 assert(LDBase && "Did not find base load for merging consecutive loads");
6469 EVT EltVT = LDBase->getValueType(0);
6470 // Ensure that the input vector size for the merged loads matches the
6471 // cumulative size of the input elements.
6472 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6473 return SDValue();
6475 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6476 return SDValue();
6478 if (IsConsecutiveLoad)
6479 return CreateLoad(VT, LDBase);
6481 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6482 // vector and a zero vector to clear out the zero elements.
6483 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6484 SmallVector<int, 4> ClearMask(NumElems, -1);
6485 for (unsigned i = 0; i < NumElems; ++i) {
6486 if (ZeroMask[i])
6487 ClearMask[i] = i + NumElems;
6488 else if (LoadMask[i])
6489 ClearMask[i] = i;
6490 }
6491 SDValue V = CreateLoad(VT, LDBase);
6492 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6493 : DAG.getConstantFP(0.0, DL, VT);
6494 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6495 }
6496 }
6498 int LoadSize =
6499 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6501 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6502 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6503 (LoadSize == 32 || LoadSize == 64) &&
6504 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6505 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6506 : MVT::getIntegerVT(LoadSize);
6507 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6508 if (TLI.isTypeLegal(VecVT)) {
6509 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6510 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6511 SDValue ResNode =
6512 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6513 LDBase->getPointerInfo(),
6514 LDBase->getAlignment(),
6515 false/*isVolatile*/, true/*ReadMem*/,
6516 false/*WriteMem*/);
6518 // Make sure the newly-created LOAD is in the same position as LDBase in
6519 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6520 // and update uses of LDBase's output chain to use the TokenFactor.
6521 if (LDBase->hasAnyUseOfValue(1)) {
6523 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6524 SDValue(ResNode.getNode(), 1));
6525 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6526 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6527 SDValue(ResNode.getNode(), 1));
6528 }
6530 return DAG.getBitcast(VT, ResNode);
6531 }
6532 }
6534 return SDValue();
6535 }
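/// Rebuild a repeated splat pattern as an IR constant vector, e.g. a 64-bit
/// SplatValue in a v4i32 type yields the two-element constant
/// <i32 lo, i32 hi>, which can then be loaded from the constant pool and
/// broadcast.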
6537 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6538 unsigned SplatBitSize, LLVMContext &C) {
6539 unsigned ScalarSize = VT.getScalarSizeInBits();
6540 unsigned NumElm = SplatBitSize / ScalarSize;
6542 SmallVector<Constant *, 32> ConstantVec;
6543 for (unsigned i = 0; i < NumElm; i++) {
6544 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6545 Constant *Const;
6546 if (VT.isFloatingPoint()) {
6547 assert((ScalarSize == 32 || ScalarSize == 64) &&
6548 "Unsupported floating point scalar size");
6549 if (ScalarSize == 32)
6550 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6551 else
6552 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6553 } else
6554 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6555 ConstantVec.push_back(Const);
6556 }
6557 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6558 }
6560 static bool isUseOfShuffle(SDNode *N) {
6561 for (auto *U : N->uses()) {
6562 if (isTargetShuffle(U->getOpcode()))
6563 return true;
6564 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6565 return isUseOfShuffle(U);
6566 }
6567 return false;
6568 }
6570 /// Attempt to use the vbroadcast instruction to generate a splat value
6571 /// from a splat BUILD_VECTOR which uses:
6572 /// a. A single scalar load, or a constant.
6573 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6575 /// The VBROADCAST node is returned when a pattern is found,
6576 /// or SDValue() otherwise.
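/// E.g. a splat of a loaded f32 becomes (VBROADCAST (load ...)), and a
/// repeated pattern such as <0, 42, 0, 42> can be loaded from the constant
/// pool as a single wider element (here 64 bits) and broadcast from there.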
6577 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6578 const X86Subtarget &Subtarget,
6579 SelectionDAG &DAG) {
6580 // VBROADCAST requires AVX.
6581 // TODO: Splats could be generated for non-AVX CPUs using SSE
6582 // instructions, but there's less potential gain for only 128-bit vectors.
6583 if (!Subtarget.hasAVX())
6586 MVT VT = BVOp->getSimpleValueType(0);
6589 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6590 "Unsupported vector type for broadcast.");
6592 BitVector UndefElements;
6593 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6595 // We need a splat of a single value to use broadcast, and it doesn't
6596 // make any sense if the value is only in one element of the vector.
6597 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6598 APInt SplatValue, Undef;
6599 unsigned SplatBitSize;
6600 bool HasUndef;
6601 // Check if this is a repeated constant pattern suitable for broadcasting.
6602 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6603 SplatBitSize > VT.getScalarSizeInBits() &&
6604 SplatBitSize < VT.getSizeInBits()) {
6605 // Avoid replacing with broadcast when it's a use of a shuffle
6606 // instruction to preserve the present custom lowering of shuffles.
6607 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6608 return SDValue();
6609 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6610 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6611 LLVMContext *Ctx = DAG.getContext();
6612 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6613 if (Subtarget.hasAVX()) {
6614 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6615 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6616 // Splatted value can fit in one INTEGER constant in constant pool.
6617 // Load the constant and broadcast it.
6618 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6619 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6620 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6621 SDValue CP = DAG.getConstantPool(C, PVT);
6622 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6624 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6625 SDValue Ld = DAG.getLoad(
6626 CVT, dl, DAG.getEntryNode(), CP,
6627 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6628 Alignment);
6629 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6630 MVT::getVectorVT(CVT, Repeat), Ld);
6631 return DAG.getBitcast(VT, Brdcst);
6632 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6633 // Splatted value can fit in one FLOAT constant in constant pool.
6634 // Load the constant and broadcast it.
6635 // AVX has support for 32- and 64-bit broadcasts for floats only.
6636 // There is no 64-bit integer splat on 32-bit subtargets.
6637 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6638 Constant *C = SplatBitSize == 32
6639 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6640 SplatValue.bitsToFloat())
6641 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6642 SplatValue.bitsToDouble());
6643 SDValue CP = DAG.getConstantPool(C, PVT);
6644 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6646 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6647 SDValue Ld = DAG.getLoad(
6648 CVT, dl, DAG.getEntryNode(), CP,
6649 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6650 Alignment);
6651 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6652 MVT::getVectorVT(CVT, Repeat), Ld);
6653 return DAG.getBitcast(VT, Brdcst);
6654 } else if (SplatBitSize > 64) {
6655 // Load the vector of constants and broadcast it.
6656 MVT CVT = VT.getScalarType();
6657 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6659 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6660 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6661 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6662 SDValue Ld = DAG.getLoad(
6663 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6664 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6665 Alignment);
6666 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6667 return DAG.getBitcast(VT, Brdcst);
6668 }
6669 }
6670 }
6672 return SDValue();
6673 }
6674 bool ConstSplatVal =
6675 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6677 // Make sure that all of the users of a non-constant load are from the
6678 // BUILD_VECTOR node.
6679 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6680 return SDValue();
6682 unsigned ScalarSize = Ld.getValueSizeInBits();
6683 bool IsGE256 = (VT.getSizeInBits() >= 256);
6685 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6686 // instruction to save 8 or more bytes of constant pool data.
6687 // TODO: If multiple splats are generated to load the same constant,
6688 // it may be detrimental to overall size. There needs to be a way to detect
6689 // that condition to know if this is truly a size win.
6690 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6692 // Handle broadcasting a single constant scalar from the constant pool
6693 // into a vector.
6694 // On Sandybridge (no AVX2), it is still better to load a constant vector
6695 // from the constant pool and not to broadcast it from a scalar.
6696 // But override that restriction when optimizing for size.
6697 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6698 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6699 EVT CVT = Ld.getValueType();
6700 assert(!CVT.isVector() && "Must not broadcast a vector type");
6702 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6703 // For size optimization, also splat v2f64 and v2i64, and for size opt
6704 // with AVX2, also splat i8 and i16.
6705 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6706 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6707 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6708 const Constant *C = nullptr;
6709 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6710 C = CI->getConstantIntValue();
6711 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6712 C = CF->getConstantFPValue();
6714 assert(C && "Invalid constant type");
6716 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6717 SDValue CP =
6718 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6719 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6720 Ld = DAG.getLoad(
6721 CVT, dl, DAG.getEntryNode(), CP,
6722 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6723 Alignment);
6725 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6726 }
6727 }
6729 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6731 // Handle AVX2 in-register broadcasts.
6732 if (!IsLoad && Subtarget.hasInt256() &&
6733 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6734 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6736 // The scalar source must be a normal load.
6737 if (!IsLoad)
6738 return SDValue();
6740 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6741 (Subtarget.hasVLX() && ScalarSize == 64))
6742 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6744 // The integer check is needed for the 64-bit into 128-bit case, so it
6745 // doesn't match f64, since there is no vbroadcastsd xmm.
6746 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6747 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6748 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6749 }
6751 // Unsupported broadcast.
6752 return SDValue();
6753 }
6755 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6756 /// underlying vector and index.
6758 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6760 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6761 SDValue ExtIdx) {
6762 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6763 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6764 return Idx;
6766 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6767 // lowered this:
6768 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6769 // to:
6770 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6771 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6772 // undef)
6773 // Constant<2>)
6774 // In this case the vector is the extract_subvector expression and the index
6775 // is 2, as specified by the shuffle.
6776 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6777 SDValue ShuffleVec = SVOp->getOperand(0);
6778 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6779 assert(ShuffleVecVT.getVectorElementType() ==
6780 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6782 int ShuffleIdx = SVOp->getMaskElt(Idx);
6783 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6784 ExtractedFromVec = ShuffleVec;
6785 return ShuffleIdx;
6786 }
6787 return Idx;
6788 }
6790 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6791 MVT VT = Op.getSimpleValueType();
6793 // Skip if insert_vec_elt is not supported.
6794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6795 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6796 return SDValue();
6798 SDLoc DL(Op);
6799 unsigned NumElems = Op.getNumOperands();
6801 SDValue VecIn1;
6802 SDValue VecIn2;
6803 SmallVector<unsigned, 4> InsertIndices;
6804 SmallVector<int, 8> Mask(NumElems, -1);
6806 for (unsigned i = 0; i != NumElems; ++i) {
6807 unsigned Opc = Op.getOperand(i).getOpcode();
6809 if (Opc == ISD::UNDEF)
6810 continue;
6812 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6813 // Quit if more than 1 element needs inserting.
6814 if (InsertIndices.size() > 1)
6815 return SDValue();
6817 InsertIndices.push_back(i);
6818 continue;
6819 }
6821 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6822 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6824 // Quit if the index is non-constant.
6825 if (!isa<ConstantSDNode>(ExtIdx))
6826 return SDValue();
6827 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6829 // Quit if extracted from a vector of a different type.
6830 if (ExtractedFromVec.getValueType() != VT)
6831 return SDValue();
6833 if (!VecIn1.getNode())
6834 VecIn1 = ExtractedFromVec;
6835 else if (VecIn1 != ExtractedFromVec) {
6836 if (!VecIn2.getNode())
6837 VecIn2 = ExtractedFromVec;
6838 else if (VecIn2 != ExtractedFromVec)
6839 // Quit if more than 2 vectors to shuffle.
6840 return SDValue();
6841 }
6843 if (ExtractedFromVec == VecIn1)
6844 Mask[i] = Idx;
6845 else if (ExtractedFromVec == VecIn2)
6846 Mask[i] = Idx + NumElems;
6847 }
6849 if (!VecIn1.getNode())
6850 return SDValue();
6852 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6853 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6855 for (unsigned Idx : InsertIndices)
6856 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6857 DAG.getIntPtrConstant(Idx, DL));
6859 return NV;
6860 }
6862 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6863 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6864 Op.getScalarValueSizeInBits() == 1 &&
6865 "Can not convert non-constant vector");
6866 uint64_t Immediate = 0;
6867 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6868 SDValue In = Op.getOperand(idx);
6869 if (!In.isUndef())
6870 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6871 }
6872 SDLoc dl(Op);
6873 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6874 return DAG.getConstant(Immediate, dl, VT);
6875 }
6876 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6877 SDValue
6878 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6879 SDLoc dl(Op);
6880 MVT VT = Op.getSimpleValueType();
6881 assert((VT.getVectorElementType() == MVT::i1) &&
6882 "Unexpected type in LowerBUILD_VECTORvXi1!");
6885 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6886 return DAG.getTargetConstant(0, dl, VT);
6888 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6889 return DAG.getTargetConstant(1, dl, VT);
6891 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6892 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6893 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6894 return DAG.getBitcast(VT, Imm);
6895 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6896 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6897 DAG.getIntPtrConstant(0, dl));
6898 }
6900 // Vector has one or more non-const elements
6901 uint64_t Immediate = 0;
6902 SmallVector<unsigned, 16> NonConstIdx;
6903 bool IsSplat = true;
6904 bool HasConstElts = false;
6905 int SplatIdx = -1;
6906 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6907 SDValue In = Op.getOperand(idx);
6908 if (In.isUndef())
6909 continue;
6910 if (!isa<ConstantSDNode>(In))
6911 NonConstIdx.push_back(idx);
6912 else {
6913 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
6914 HasConstElts = true;
6915 }
6916 if (SplatIdx < 0)
6917 SplatIdx = idx;
6918 else if (In != Op.getOperand(SplatIdx))
6919 IsSplat = false;
6920 }
6922 // For splats, use "(select i1 splat_elt, all-ones, all-zeroes)".
6923 if (IsSplat)
6924 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
6925 DAG.getConstant(1, dl, VT),
6926 DAG.getConstant(0, dl, VT));
6928 // Insert elements one by one.
6929 SDValue DstVec;
6930 SDValue Imm;
6931 if (Immediate) {
6932 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6933 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6934 }
6935 else if (HasConstElts)
6936 Imm = DAG.getConstant(0, dl, VT);
6937 else
6938 Imm = DAG.getUNDEF(VT);
6939 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6940 DstVec = DAG.getBitcast(VT, Imm);
6941 else {
6942 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6943 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6944 DAG.getIntPtrConstant(0, dl));
6945 }
6947 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6948 unsigned InsertIdx = NonConstIdx[i];
6949 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6950 Op.getOperand(InsertIdx),
6951 DAG.getIntPtrConstant(InsertIdx, dl));
6952 }
6953 return DstVec;
6954 }
6956 /// \brief Return true if \p N implements a horizontal binop and return the
6957 /// operands for the horizontal binop into V0 and V1.
6959 /// This is a helper function of LowerToHorizontalOp().
6960 /// This function checks that the build_vector \p N in input implements a
6961 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6962 /// operation to match.
6963 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6964 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6965 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6966 /// arithmetic sub.
6968 /// This function only analyzes elements of \p N whose indices are
6969 /// in range [BaseIdx, LastIdx).
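/// E.g. for a v4f32 build_vector matched with \p Opcode ISD::FADD:
///   <(fadd A0, A1), (fadd A2, A3), (fadd B0, B1), (fadd B2, B3)>
/// where Ai/Bi are extract_vector_elt of A and B, this returns true with
/// V0 = A and V1 = B, which is the HADDPS pattern.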
6970 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6971 SelectionDAG &DAG,
6972 unsigned BaseIdx, unsigned LastIdx,
6973 SDValue &V0, SDValue &V1) {
6974 EVT VT = N->getValueType(0);
6976 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6977 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6978 "Invalid Vector in input!");
6980 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6981 bool CanFold = true;
6982 unsigned ExpectedVExtractIdx = BaseIdx;
6983 unsigned NumElts = LastIdx - BaseIdx;
6984 V0 = DAG.getUNDEF(VT);
6985 V1 = DAG.getUNDEF(VT);
6987 // Check if N implements a horizontal binop.
6988 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6989 SDValue Op = N->getOperand(i + BaseIdx);
6992 if (Op->isUndef()) {
6993 // Update the expected vector extract index.
6994 if (i * 2 == NumElts)
6995 ExpectedVExtractIdx = BaseIdx;
6996 ExpectedVExtractIdx += 2;
6997 continue;
6998 }
7000 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7002 if (!CanFold)
7003 break;
7005 SDValue Op0 = Op.getOperand(0);
7006 SDValue Op1 = Op.getOperand(1);
7008 // Try to match the following pattern:
7009 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7010 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7011 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7012 Op0.getOperand(0) == Op1.getOperand(0) &&
7013 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7014 isa<ConstantSDNode>(Op1.getOperand(1)));
7015 if (!CanFold)
7016 break;
7018 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7019 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7021 if (i * 2 < NumElts) {
7022 if (V0.isUndef()) {
7023 V0 = Op0.getOperand(0);
7024 if (V0.getValueType() != VT)
7025 return false;
7026 }
7027 } else {
7028 if (V1.isUndef()) {
7029 V1 = Op0.getOperand(0);
7030 if (V1.getValueType() != VT)
7031 return false;
7032 }
7033 if (i * 2 == NumElts)
7034 ExpectedVExtractIdx = BaseIdx;
7035 }
7037 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7038 if (I0 == ExpectedVExtractIdx)
7039 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7040 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7041 // Try to match the following dag sequence:
7042 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7043 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7044 } else
7045 CanFold = false;
7047 ExpectedVExtractIdx += 2;
7048 }
7050 return CanFold;
7051 }
7053 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7054 /// a concat_vector.
7056 /// This is a helper function of LowerToHorizontalOp().
7057 /// This function expects two 256-bit vectors called V0 and V1.
7058 /// At first, each vector is split into two separate 128-bit vectors.
7059 /// Then, the resulting 128-bit vectors are used to implement two
7060 /// horizontal binary operations.
7062 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7064 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7065 /// the two new horizontal binop.
7066 /// When Mode is set, the first horizontal binop dag node would take as input
7067 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7068 /// horizontal binop dag node would take as input the lower 128-bit of V1
7069 /// and the upper 128-bit of V1.
7071 /// HADD V0_LO, V0_HI
7072 /// HADD V1_LO, V1_HI
7074 /// Otherwise, the first horizontal binop dag node takes as input the lower
7075 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7076 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7078 /// HADD V0_LO, V1_LO
7079 /// HADD V0_HI, V1_HI
7081 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7082 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7083 /// the upper 128-bits of the result.
7084 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7085 const SDLoc &DL, SelectionDAG &DAG,
7086 unsigned X86Opcode, bool Mode,
7087 bool isUndefLO, bool isUndefHI) {
7088 MVT VT = V0.getSimpleValueType();
7089 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7090 "Invalid nodes in input!");
7092 unsigned NumElts = VT.getVectorNumElements();
7093 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7094 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7095 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7096 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7097 MVT NewVT = V0_LO.getSimpleValueType();
7099 SDValue LO = DAG.getUNDEF(NewVT);
7100 SDValue HI = DAG.getUNDEF(NewVT);
7102 if (Mode) {
7103 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7104 if (!isUndefLO && !V0->isUndef())
7105 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7106 if (!isUndefHI && !V1->isUndef())
7107 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7108 } else {
7109 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7110 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7111 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7113 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7114 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7115 }
7117 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7118 }
7120 /// Returns true iff \p BV builds a vector with the result equivalent to
7121 /// the result of ADDSUB operation.
7122 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7123 /// are written to the parameters \p Opnd0 and \p Opnd1.
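/// E.g. the v4f32 build_vector
///   <(fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3)>
/// is the ADDSUBPS pattern and yields Opnd0 = A, Opnd1 = B.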
7124 static bool isAddSub(const BuildVectorSDNode *BV,
7125 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7126 SDValue &Opnd0, SDValue &Opnd1) {
7128 MVT VT = BV->getSimpleValueType(0);
7129 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7130 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7131 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7132 return false;
7134 unsigned NumElts = VT.getVectorNumElements();
7135 SDValue InVec0 = DAG.getUNDEF(VT);
7136 SDValue InVec1 = DAG.getUNDEF(VT);
7138 // Odd-numbered elements in the input build vector are obtained from
7139 // adding two integer/float elements.
7140 // Even-numbered elements in the input build vector are obtained from
7141 // subtracting two integer/float elements.
7142 unsigned ExpectedOpcode = ISD::FSUB;
7143 unsigned NextExpectedOpcode = ISD::FADD;
7144 bool AddFound = false;
7145 bool SubFound = false;
7147 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7148 SDValue Op = BV->getOperand(i);
7150 // Skip 'undef' values.
7151 unsigned Opcode = Op.getOpcode();
7152 if (Opcode == ISD::UNDEF) {
7153 std::swap(ExpectedOpcode, NextExpectedOpcode);
7154 continue;
7155 }
7157 // Early exit if we found an unexpected opcode.
7158 if (Opcode != ExpectedOpcode)
7159 return false;
7161 SDValue Op0 = Op.getOperand(0);
7162 SDValue Op1 = Op.getOperand(1);
7164 // Try to match the following pattern:
7165 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7166 // Early exit if we cannot match that sequence.
7167 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7168 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7169 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7170 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7171 Op0.getOperand(1) != Op1.getOperand(1))
7172 return false;
7174 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7175 if (I0 != i)
7176 return false;
7178 // We found a valid add/sub node. Update the information accordingly.
7179 if (i & 1)
7180 AddFound = true;
7181 else
7182 SubFound = true;
7184 // Update InVec0 and InVec1.
7185 if (InVec0.isUndef()) {
7186 InVec0 = Op0.getOperand(0);
7187 if (InVec0.getSimpleValueType() != VT)
7188 return false;
7189 }
7190 if (InVec1.isUndef()) {
7191 InVec1 = Op1.getOperand(0);
7192 if (InVec1.getSimpleValueType() != VT)
7193 return false;
7194 }
7196 // Make sure that operands in input to each add/sub node always
7197 // come from a same pair of vectors.
7198 if (InVec0 != Op0.getOperand(0)) {
7199 if (ExpectedOpcode == ISD::FSUB)
7200 return false;
7202 // FADD is commutable. Try to commute the operands
7203 // and then test again.
7204 std::swap(Op0, Op1);
7205 if (InVec0 != Op0.getOperand(0))
7206 return false;
7207 }
7209 if (InVec1 != Op1.getOperand(0))
7210 return false;
7212 // Update the pair of expected opcodes.
7213 std::swap(ExpectedOpcode, NextExpectedOpcode);
7216 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7217 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7218 return false;
7220 Opnd0 = InVec0;
7221 Opnd1 = InVec1;
7222 return true;
7223 }
7225 /// Returns true if it is possible to fold MUL and an idiom that has already
7226 /// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7227 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7228 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7230 /// Prior to calling this function it should be known that there is some
7231 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7232 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7233 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7234 /// of \p Opnd0 uses is expected to be equal to 2.
7235 /// For example, this function may be called for the following IR:
7236 /// %AB = fmul fast <2 x double> %A, %B
7237 /// %Sub = fsub fast <2 x double> %AB, %C
7238 /// %Add = fadd fast <2 x double> %AB, %C
7239 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7240 /// <2 x i32> <i32 0, i32 3>
7241 /// There is a def for %Addsub here, which potentially can be replaced by
7242 /// X86ISD::ADDSUB operation:
7243 /// %Addsub = X86ISD::ADDSUB %AB, %C
7244 /// and such ADDSUB can further be replaced with FMADDSUB:
7245 /// %Addsub = FMADDSUB %A, %B, %C.
7247 /// The main reason why this method is called before the replacement of the
7248 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7249 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7250 /// FMADDSUB is.
7251 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7252 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7253 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7254 !Subtarget.hasAnyFMA())
7255 return false;
7257 // FIXME: These checks must match the similar ones in
7258 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7259 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7260 // or MUL + ADDSUB to FMADDSUB.
7261 const TargetOptions &Options = DAG.getTarget().Options;
7262 bool AllowFusion =
7263 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7264 if (!AllowFusion)
7265 return false;
7267 Opnd2 = Opnd1;
7268 Opnd1 = Opnd0.getOperand(1);
7269 Opnd0 = Opnd0.getOperand(0);
7271 return true;
7272 }
7274 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7275 /// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7276 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7277 const X86Subtarget &Subtarget,
7278 SelectionDAG &DAG) {
7279 SDValue Opnd0, Opnd1;
7280 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7281 return SDValue();
7283 MVT VT = BV->getSimpleValueType(0);
7284 SDLoc DL(BV);
7286 // Try to generate an X86ISD::FMADDSUB node here.
7287 SDValue Opnd2;
7288 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7289 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7291 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7292 // the ADDSUB idiom has been successfully recognized. There are no known
7293 // X86 targets with 512-bit ADDSUB instructions!
7294 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7295 // recognition!
7296 if (VT.is512BitVector())
7297 return SDValue();
7299 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7300 }
7302 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
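/// E.g. the v4i32 build_vector
///   <(add B0, B1), (add B2, B3), (add C0, C1), (add C2, C3)>
/// becomes a single (HADD B, C) node (PHADDD) when SSSE3 is available.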
7303 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7304 const X86Subtarget &Subtarget,
7305 SelectionDAG &DAG) {
7306 MVT VT = BV->getSimpleValueType(0);
7307 unsigned NumElts = VT.getVectorNumElements();
7308 unsigned NumUndefsLO = 0;
7309 unsigned NumUndefsHI = 0;
7310 unsigned Half = NumElts/2;
7312 // Count the number of UNDEF operands in the build_vector in input.
7313 for (unsigned i = 0, e = Half; i != e; ++i)
7314 if (BV->getOperand(i)->isUndef())
7315 NumUndefsLO++;
7317 for (unsigned i = Half, e = NumElts; i != e; ++i)
7318 if (BV->getOperand(i)->isUndef())
7319 NumUndefsHI++;
7321 // Early exit if this is either a build_vector of all UNDEFs or all the
7322 // operands but one are UNDEF.
7323 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7324 return SDValue();
7326 SDLoc DL(BV);
7327 SDValue InVec0, InVec1;
7328 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7329 // Try to match an SSE3 float HADD/HSUB.
7330 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7331 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7333 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7334 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7335 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7336 // Try to match an SSSE3 integer HADD/HSUB.
7337 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7338 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7340 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7341 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7342 }
7344 if (!Subtarget.hasAVX())
7345 return SDValue();
7347 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7348 // Try to match an AVX horizontal add/sub of packed single/double
7349 // precision floating point values from 256-bit vectors.
7350 SDValue InVec2, InVec3;
7351 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7352 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7353 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7354 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7355 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7357 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7358 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7359 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7360 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7361 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7362 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7363 // Try to match an AVX2 horizontal add/sub of signed integers.
7364 SDValue InVec2, InVec3;
7365 unsigned X86Opcode;
7366 bool CanFold = true;
7368 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7369 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7370 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7371 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7372 X86Opcode = X86ISD::HADD;
7373 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7374 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7375 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7376 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7377 X86Opcode = X86ISD::HSUB;
7378 else
7379 CanFold = false;
7381 if (CanFold) {
7382 // Fold this build_vector into a single horizontal add/sub.
7383 // Do this only if the target has AVX2.
7384 if (Subtarget.hasAVX2())
7385 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7387 // Do not try to expand this build_vector into a pair of horizontal
7388 // add/sub if we can emit a pair of scalar add/sub.
7389 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7390 return SDValue();
7392 // Convert this build_vector into a pair of horizontal binops followed by
7393 // a concat vector.
7394 bool isUndefLO = NumUndefsLO == Half;
7395 bool isUndefHI = NumUndefsHI == Half;
7396 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7397 isUndefLO, isUndefHI);
7398 }
7399 }
7401 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7402 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7403 unsigned X86Opcode;
7404 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7405 X86Opcode = X86ISD::HADD;
7406 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7407 X86Opcode = X86ISD::HSUB;
7408 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7409 X86Opcode = X86ISD::FHADD;
7410 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7411 X86Opcode = X86ISD::FHSUB;
7412 else
7413 return SDValue();
7415 // Don't try to expand this build_vector into a pair of horizontal add/sub
7416 // if we can simply emit a pair of scalar add/sub.
7417 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7418 return SDValue();
7420 // Convert this build_vector into two horizontal add/sub followed by
7421 // a concat vector.
7422 bool isUndefLO = NumUndefsLO == Half;
7423 bool isUndefHI = NumUndefsHI == Half;
7424 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7425 isUndefLO, isUndefHI);
7426 }
7428 return SDValue();
7429 }
7431 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7432 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7433 /// just apply the bit to the vectors.
7434 /// NOTE: It's not in our interest to start making a general purpose
7435 /// vectorizer from this, but enough scalar bit operations are created by the
7436 /// later legalization + scalarization stages to need basic support.
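/// E.g. <(and x, 1), (and y, 2), (and z, 4), (and w, 8)> becomes
///   (and (build_vector x, y, z, w), (build_vector 1, 2, 4, 8))
/// so only the bit operation itself needs vector support.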
7437 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7438 SelectionDAG &DAG) {
7439 SDLoc DL(Op);
7440 MVT VT = Op->getSimpleValueType(0);
7441 unsigned NumElems = VT.getVectorNumElements();
7442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7444 // Check that all elements have the same opcode.
7445 // TODO: Should we allow UNDEFS and if so how many?
7446 unsigned Opcode = Op->getOperand(0).getOpcode();
7447 for (unsigned i = 1; i < NumElems; ++i)
7448 if (Opcode != Op->getOperand(i).getOpcode())
7449 return SDValue();
7451 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7452 switch (Opcode) {
7453 default:
7454 return SDValue();
7455 case ISD::AND:
7456 case ISD::XOR:
7457 case ISD::OR:
7458 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7459 return SDValue();
7460 break;
7461 }
7463 SmallVector<SDValue, 4> LHSElts, RHSElts;
7464 for (SDValue Elt : Op->ops()) {
7465 SDValue LHS = Elt.getOperand(0);
7466 SDValue RHS = Elt.getOperand(1);
7468 // We expect the canonicalized RHS operand to be the constant.
7469 if (!isa<ConstantSDNode>(RHS))
7470 return SDValue();
7471 LHSElts.push_back(LHS);
7472 RHSElts.push_back(RHS);
7473 }
7475 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7476 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7477 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7478 }
7480 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7481 /// functionality to do this, so it's all zeros, all ones, or some derivation
7482 /// that is cheap to calculate.
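/// E.g. all-zeros lowers to (pxor/xorps reg, reg) via getZeroVector and
/// all-ones to (pcmpeqd reg, reg) via getOnesVector; neither needs a
/// constant-pool load.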
7483 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7484 const X86Subtarget &Subtarget) {
7485 SDLoc DL(Op);
7486 MVT VT = Op.getSimpleValueType();
7488 // Vectors containing all zeros can be matched by pxor and xorps.
7489 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7490 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7491 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7492 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7493 return Op;
7495 return getZeroVector(VT, Subtarget, DAG, DL);
7496 }
7498 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7499 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7500 // vpcmpeqd on 256-bit vectors.
7501 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7502 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7503 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7504 return Op;
7506 return getOnesVector(VT, DAG, DL);
7507 }
7509 return SDValue();
7510 }
7512 SDValue
7513 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7514 SDLoc dl(Op);
7516 MVT VT = Op.getSimpleValueType();
7517 MVT ExtVT = VT.getVectorElementType();
7518 unsigned NumElems = Op.getNumOperands();
7520 // Generate vectors for predicate vectors.
7521 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7522 return LowerBUILD_VECTORvXi1(Op, DAG);
7524 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7525 return VectorConstant;
7527 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7528 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7529 return AddSub;
7530 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7531 return HorizontalOp;
7532 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7533 return Broadcast;
7534 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7535 return BitOp;
7537 unsigned EVTBits = ExtVT.getSizeInBits();
7539 unsigned NumZero = 0;
7540 unsigned NumNonZero = 0;
7541 uint64_t NonZeros = 0;
7542 bool IsAllConstants = true;
7543 SmallSet<SDValue, 8> Values;
7544 for (unsigned i = 0; i < NumElems; ++i) {
7545 SDValue Elt = Op.getOperand(i);
7546 if (Elt.isUndef())
7547 continue;
7548 Values.insert(Elt);
7549 if (Elt.getOpcode() != ISD::Constant &&
7550 Elt.getOpcode() != ISD::ConstantFP)
7551 IsAllConstants = false;
7552 if (X86::isZeroNode(Elt))
7553 NumZero++;
7554 else {
7555 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7556 NonZeros |= ((uint64_t)1 << i);
7557 NumNonZero++;
7558 }
7559 }
7561 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7562 if (NumNonZero == 0)
7563 return DAG.getUNDEF(VT);
7565 // Special case for single non-zero, non-undef, element.
7566 if (NumNonZero == 1) {
7567 unsigned Idx = countTrailingZeros(NonZeros);
7568 SDValue Item = Op.getOperand(Idx);
7570 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7571 // the value are obviously zero, truncate the value to i32 and do the
7572 // insertion that way. Only do this if the value is non-constant or if the
7573 // value is a constant being inserted into element 0. It is cheaper to do
7574 // a constant pool load than it is to do a movd + shuffle.
7575 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7576 (!IsAllConstants || Idx == 0)) {
7577 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7579 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7580 MVT VecVT = MVT::v4i32;
7582 // Truncate the value (which may itself be a constant) to i32, and
7583 // convert it to a vector with movd (S2V+shuffle to zero extend).
7584 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7585 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7586 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7587 Item, Idx * 2, true, Subtarget, DAG));
7588 }
7589 }
7591 // If we have a constant or non-constant insertion into the low element of
7592 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7593 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7594 // depending on what the source datatype is.
7595 if (Idx == 0) {
7596 if (NumZero == 0)
7597 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7599 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7600 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7601 assert((VT.is128BitVector() || VT.is256BitVector() ||
7602 VT.is512BitVector()) &&
7603 "Expected an SSE value type!");
7604 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7605 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7606 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7607 }
7609 // We can't directly insert an i8 or i16 into a vector, so zero extend
7610 // it to i32 first.
7611 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7612 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7613 if (VT.getSizeInBits() >= 256) {
7614 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7615 if (Subtarget.hasAVX()) {
7616 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7617 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7618 } else {
7619 // Without AVX, we need to extend to a 128-bit vector and then
7620 // insert into the 256-bit vector.
7621 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7622 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7623 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7624 }
7625 } else {
7626 assert(VT.is128BitVector() && "Expected an SSE value type!");
7627 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7628 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7629 }
7630 return DAG.getBitcast(VT, Item);
7631 }
7632 }
7634 // Is it a vector logical left shift?
7635 if (NumElems == 2 && Idx == 1 &&
7636 X86::isZeroNode(Op.getOperand(0)) &&
7637 !X86::isZeroNode(Op.getOperand(1))) {
7638 unsigned NumBits = VT.getSizeInBits();
7639 return getVShift(true, VT,
7640 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7641 VT, Op.getOperand(1)),
7642 NumBits/2, DAG, *this, dl);
7643 }
7645 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7646 return SDValue();
7648 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7649 // is a non-constant being inserted into an element other than the low one,
7650 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7651 // movd/movss) to move this into the low element, then shuffle it into
7652 // place.
7653 if (EVTBits == 32) {
7654 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7655 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7656 }
7657 }
7659 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7660 if (Values.size() == 1) {
7661 if (EVTBits == 32) {
7662 // Instead of a shuffle like this:
7663 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7664 // Check if it's possible to issue this instead.
7665 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7666 unsigned Idx = countTrailingZeros(NonZeros);
7667 SDValue Item = Op.getOperand(Idx);
7668 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7669 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7670 }
7671 return SDValue();
7672 }
7674 // A vector full of immediates; various special cases are already
7675 // handled, so this is best done with a single constant-pool load.
7676 if (IsAllConstants)
7677 return SDValue();
7679 // See if we can use a vector load to get all of the elements.
7680 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7681 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7682 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7683 return LD;
7684 }
7686 // For AVX-length vectors, build the individual 128-bit pieces and use
7687 // shuffles to put them in place.
7688 if (VT.is256BitVector() || VT.is512BitVector()) {
7689 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7691 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7693 // Build both the lower and upper subvector.
7694 SDValue Lower =
7695 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7696 SDValue Upper = DAG.getBuildVector(
7697 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7699 // Recreate the wider vector with the lower and upper part.
7700 if (VT.is256BitVector())
7701 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7702 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7703 }
7705 // Let legalizer expand 2-wide build_vectors.
7706 if (EVTBits == 64) {
7707 if (NumNonZero == 1) {
7708 // One half is zero or undef.
7709 unsigned Idx = countTrailingZeros(NonZeros);
7710 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7711 Op.getOperand(Idx));
7712 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7713 }
7714 return SDValue();
7715 }
7717 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7718 if (EVTBits == 8 && NumElems == 16)
7719 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7720 DAG, Subtarget))
7721 return V;
7723 if (EVTBits == 16 && NumElems == 8)
7724 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7725 DAG, Subtarget))
7726 return V;
7728 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7729 if (EVTBits == 32 && NumElems == 4)
7730 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7731 return V;
7733 // If element VT is == 32 bits, turn it into a number of shuffles.
7734 if (NumElems == 4 && NumZero > 0) {
7735 SmallVector<SDValue, 8> Ops(NumElems);
7736 for (unsigned i = 0; i < 4; ++i) {
7737 bool isZero = !(NonZeros & (1ULL << i));
7738 if (isZero)
7739 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7740 else
7741 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7742 }
7744 for (unsigned i = 0; i < 2; ++i) {
7745 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7746 default: llvm_unreachable("Unexpected NonZero count");
7747 case 0:
7748 Ops[i] = Ops[i*2]; // Must be a zero vector.
7749 break;
7750 case 1:
7751 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7752 break;
7753 case 2:
7754 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7755 break;
7756 case 3:
7757 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7758 break;
7759 }
7760 }
7762 bool Reverse1 = (NonZeros & 0x3) == 2;
7763 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7764 int MaskVec[] = {
7765 Reverse1 ? 1 : 0,
7766 Reverse1 ? 0 : 1,
7767 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7768 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7769 };
7770 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7771 }
  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).isUndef()) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in the
    // bottom slot of the vector (which generates no code for SSE).
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < NumElems; ++i) {
      if (!Op.getOperand(i).isUndef())
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        Ops[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If Ops[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (Ops[i+EltStride].isUndef() &&
            EltStride == NumElems/2)
          continue;

        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return Ops[0];
  }
  return SDValue();
}

// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return concat256BitVectors(
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}

static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);

  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}

static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
          Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}

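// For example, a v8i32 CONCAT_VECTORS of two v4i32 operands is lowered by
// LowerAVXCONCAT_VECTORS to roughly:
//   t = INSERT_SUBVECTOR undef:v8i32, V1, 0
//   r = INSERT_SUBVECTOR t,           V2, 4
// which instruction selection then matches to a single vinsertf128.
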
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//

/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}

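// For example, <0, -1, 2, 3> is a no-op mask: every defined element already
// selects its own slot, so the shuffle can be dropped entirely. <0, 0, 2, 3>
// is not, since element 1 must be copied from element 0.
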
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

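// Worked example: for v8i32, the mask <0, 9, 2, 11, 4, 13, 6, 15> performs
// the same shuffle in both 128-bit lanes. With second-vector indices
// remapped into [LaneSize, 2*LaneSize), RepeatedMask becomes <0, 5, 2, 7>:
// each lane keeps the even elements of V1 and takes the odd elements from
// the matching lane of V2.
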
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}

/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }
  return true;
}

/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}

// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// mask.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}

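// Worked example: Mask = <0, 5, 2, 7> with Zeroable[1] set merges into
// TargetMask = <0, SM_SentinelZero, 2, 7>, letting later matching treat
// element 1 as a free zero rather than a required element of V2.
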
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}

/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

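// Worked example: the mask <2, 3, 0, 1> encodes as
//   (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6) == 0x4E,
// the familiar PSHUFD immediate for swapping the two 64-bit halves of an
// XMM register.
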
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}

/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}

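// Worked example: with V2 a build_vector of all zeros and a v4i32 mask of
// <0, 4, 1, 5>, elements 1 and 3 reference the known-zero input, so the
// returned Zeroable is 0b1010 and only elements 0 and 2 still constrain the
// shuffle.
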
// The shuffle result takes the form:
// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending
// order. Each Zeroable element corresponds to a particular Mask element, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order. If such a sub-mask exists, the function returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}

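// Worked example: a v4i32 single-input shuffle with mask <1, 0, zz, 3>
// (element 2 zeroable) produces the per-byte PSHUFB control
//   <4,5,6,7, 0,1,2,3, 0x80,0x80,0x80,0x80, 12,13,14,15>
// since each 32-bit element expands to four byte selectors and a byte with
// the sign bit set (0x80) zeroes the corresponding destination byte.
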
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}

static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                        unsigned &UnpackOpcode, bool IsUnary,
                                        ArrayRef<int> TargetMask, SDLoc &DL,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}

// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  // Unpack didn't help.
  return SDValue();
}

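// Worked example: for v4i32, createUnpackShuffleMask produces <0, 4, 1, 5>
// (lo) and <2, 6, 3, 7> (hi), so a mask of <2, 6, 3, 7> lowers directly to
// UNPCKH V1, V2, while the commuted mask <6, 2, 7, 3> lowers to
// UNPCKH V2, V1.
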
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}

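// Worked example: a v4i32 mask of <0, zz, 2, zz> whose zeroable lanes come
// from a zero input builds VMask = <-1, 0, -1, 0> and lowers to a single
// AND, passing elements 0 and 2 of V1 through and clearing elements 1 and 3.
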
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}

static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);

static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}

static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);

  return ScaledMask;
}

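// Worked example: scaling the v4i64 blend mask 0b0101 to v8i32 (Scale = 2)
// widens each selector bit to two bits, giving 0b00110011.
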
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}

/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}

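// Worked example: Mask = <2, 1, 4, 7> first blends with
// BlendMask = <4, 1, 2, 7> (each slot takes either its own V1 element or the
// same slot of V2), then moves the blended elements into place with the
// single-input PermuteMask = <2, 1, 0, 3>.
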
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}

/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}

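// Worked example: for v8i16, Mask = <11, 12, 13, 14, 15, 0, 1, 2> matches
// with Rotation = 3, Lo = V1 and Hi = V2: the low five slots take the tail
// of V2 and the high three slots take the head of V1, i.e. a rotate of the
// concatenation by three elements (a PALIGNR immediate of 6 bytes).
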
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}

static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}

/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}

/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}

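// Worked example: for v8i16, Mask = <zz, 0, 1, 2, 3, 4, 5, 6> with element 0
// zeroable matches at Scale = 8, Shift = 1, Left = true: the whole 128-bit
// lane acts as one element shifted left by 1 * 16 / 8 == 2 bytes, i.e. a
// single VSHLDQ (PSLLDQ) of 2.
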
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  unsigned Opcode;
  SDValue V = V1;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}

/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return SDValue();

  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
  // Remainder of lower half result is zero and upper half is all undef.
  auto LowerAsEXTRQ = [&]() {
    // Determine the extraction length from the part of the
    // lower half that isn't zeroable.
    int Len = HalfSize;
    for (; Len > 0; --Len)
      if (!Zeroable[Len - 1])
        break;
    assert(Len > 0 && "Zeroable shuffle mask");

    // Attempt to match first Len sequential elements from the lower half.
    SDValue Src;
    int Idx = -1;
    for (int i = 0; i != Len; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      SDValue &V = (M < Size ? V1 : V2);
      M = M % Size;

      // The extracted elements must start at a valid index and all mask
      // elements must be in the lower half.
      if (i > M || M >= HalfSize)
        return SDValue();

      if (Idx < 0 || (Src == V && Idx == (M - i))) {
        Src = V;
        Idx = M - i;
        continue;
      }
      return SDValue();
    }

    if (Idx < 0)
      return SDValue();

    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));
  };

  if (SDValue ExtrQ = LowerAsEXTRQ())
    return ExtrQ;

  // INSERTQ: Extract lowest Len elements from lower half of second source and
  // insert over first source, starting at Idx.
  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
  auto LowerAsInsertQ = [&]() {
    for (int Idx = 0; Idx != HalfSize; ++Idx) {
      SDValue Base;

      // Attempt to match first source from mask before insertion point.
      if (isUndefInRange(Mask, 0, Idx)) {
        /* EMPTY */
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
        Base = V1;
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
        Base = V2;
      } else {
        continue;
      }

      // Extend the extraction length looking to match both the insertion of
      // the second source and the remaining elements of the first.
      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
        SDValue Insert;
        int Len = Hi - Idx;

        // Match insertion.
        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
          Insert = V1;
        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
          Insert = V2;
        } else {
          continue;
        }

        // Match the remaining elements of the lower half.
        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
          /* EMPTY */
        } else if ((!Base || (Base == V1)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
          Base = V1;
        } else if ((!Base || (Base == V2)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                              Size + Hi)) {
          Base = V2;
        } else {
          continue;
        }

        // We may not have a base (first source) - this can safely be undefined.
        if (!Base)
          Base = DAG.getUNDEF(VT);

        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
                           DAG.getConstant(BitLen, DL, MVT::i8),
                           DAG.getConstant(BitIdx, DL, MVT::i8));
      }
    }

    return SDValue();
  };

  if (SDValue InsertQ = LowerAsInsertQ())
    return InsertQ;

  return SDValue();
}

9291 /// \brief Lower a vector shuffle as a zero or any extension.
9293 /// Given a specific number of elements, element bit width, and extension
9294 /// stride, produce either a zero or any extension based on the available
9295 /// features of the subtarget. The extended elements are consecutive and
9296 /// begin and can start from an offsetted element index in the input; to
9297 /// avoid excess shuffling the offset must either being in the bottom lane
9298 /// or at the start of a higher lane. All extended elements must be from
9300 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9301 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9302 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9303 assert(Scale > 1 && "Need a scale to extend.");
9304 int EltBits = VT.getScalarSizeInBits();
9305 int NumElements = VT.getVectorNumElements();
9306 int NumEltsPerLane = 128 / EltBits;
9307 int OffsetLane = Offset / NumEltsPerLane;
9308 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9309 "Only 8, 16, and 32 bit elements can be extended.");
9310 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9311 assert(0 <= Offset && "Extension offset must be positive.");
9312 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9313 "Extension offset must be in the first lane or start an upper lane.");
9315 // Check that an index is in same lane as the base offset.
9316 auto SafeOffset = [&](int Idx) {
9317 return OffsetLane == (Idx / NumEltsPerLane);
9320 // Shift along an input so that the offset base moves to the first element.
9321 auto ShuffleOffset = [&](SDValue V) {
9325 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9326 for (int i = 0; i * Scale < NumElements; ++i) {
9327 int SrcIdx = i + Offset;
9328 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9330 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9333 // Found a valid zext mask! Try various lowering strategies based on the
9334 // input type and available ISA extensions.
9335 if (Subtarget.hasSSE41()) {
9336 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9337 // PUNPCK will catch this in a later shuffle match.
9338 if (Offset && Scale == 2 && VT.is128BitVector())
9340 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9341 NumElements / Scale);
9342 InputV = ShuffleOffset(InputV);
9343 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9344 return DAG.getBitcast(VT, InputV);
9347 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9349 // For any extends we can cheat for larger element sizes and use shuffle
9350 // instructions that can fold with a load and/or copy.
9351 if (AnyExt && EltBits == 32) {
9352 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9354 return DAG.getBitcast(
9355 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9356 DAG.getBitcast(MVT::v4i32, InputV),
9357 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9359 if (AnyExt && EltBits == 16 && Scale > 2) {
9360 int PSHUFDMask[4] = {Offset / 2, -1,
9361 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9362 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9363 DAG.getBitcast(MVT::v4i32, InputV),
9364 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9365 int PSHUFWMask[4] = {1, -1, -1, -1};
9366 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9367 return DAG.getBitcast(
9368 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9369 DAG.getBitcast(MVT::v8i16, InputV),
9370 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9373 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9375 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9376 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9377 assert(VT.is128BitVector() && "Unexpected vector width!");
9379 int LoIdx = Offset * EltBits;
9380 SDValue Lo = DAG.getBitcast(
9381 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9382 DAG.getConstant(EltBits, DL, MVT::i8),
9383 DAG.getConstant(LoIdx, DL, MVT::i8)));
9385 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9386 !SafeOffset(Offset + 1))
9387 return DAG.getBitcast(VT, Lo);
9389 int HiIdx = (Offset + 1) * EltBits;
9390 SDValue Hi = DAG.getBitcast(
9391 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9392 DAG.getConstant(EltBits, DL, MVT::i8),
9393 DAG.getConstant(HiIdx, DL, MVT::i8)));
9394 return DAG.getBitcast(VT,
9395 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }
  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
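///
/// For example, a v16i8 mask of the form
///   <0, Z, Z, Z, 1, Z, Z, Z, 2, Z, Z, Z, 3, Z, Z, Z>
/// (where Z marks a zeroable lane) matches at Scale == 4 and, when SSE4.1 is
/// available, can be emitted as a single PMOVZXBD.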
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };
  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }
  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
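///
/// For example, the v4f32 mask <4, 1, 2, 3> inserts only V2's low element
/// into V1; when V1 is not zeroable this can be a single MOVSS before SSE4.1,
/// while with SSE4.1 we bail and let the generic blend lowering handle it.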
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // elements.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }
  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }
  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
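///
/// For example, broadcasting the upper i16 half of an i32 build-vector
/// operand becomes an SRL by 16 followed by a TRUNCATE and the VBROADCAST,
/// a sequence that folding can often collapse into a broadcast from memory.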
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
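///
/// For example, the single-input v4f32 mask <0, 0, 0, 0> becomes VBROADCASTSS
/// on AVX (folding a load of the scalar where it can), while v2f64 can use
/// MOVDDUP from SSE3 onward.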
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }
  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));

    // Make sure the newly-created LOAD is in the same position as Ld in
    // terms of dependency. We create a TokenFactor for Ld and V,
    // and update uses of Ld's output chain to use the TokenFactor.
    if (Ld->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                             SDValue(V.getNode(), 1));
    }
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
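//
// The 8-bit immediate built below encodes the source element in bits [7:6],
// the destination element in bits [5:4], and the lanes to zero in bits [3:0];
// e.g. 0x1C (src 0, dst 1, zmask 1100) writes V2[0] into lane 1 and zeroes
// lanes 2 and 3.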
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };
  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
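///
/// For example, the v4i32 mask <3, 7, 2, 6> is not an unpack on its own, but
/// after swapping the top two elements of each input it is exactly UNPCKHDQ
/// of the permuted inputs.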
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;
  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };
  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
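///
/// For example, the single-input mask <1, 0> swaps the two doubles and is
/// emitted as SHUFPD with immediate 1, or as VPERMILPD (which can also fold
/// a load) when AVX is available.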
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// fiddling.
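///
/// For example, with SSE2 alone a blend such as <0, 3> typically matches none
/// of the integer strategies below and ends up as SHUFPD with immediate 2 in
/// the floating point domain.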
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }

  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
  // If we have a blend of two same-type PACKUS operations and the blend aligns
  // with the low and high halves, we can just merge the PACKUS operations.
  // This is particularly important as it lets us merge shuffles that this
  // routine itself creates.
  auto GetPackNode = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2)) {
      EVT PackVT = V1Pack.getValueType();
      if (PackVT == V2Pack.getValueType())
        return DAG.getBitcast(MVT::v2i64,
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
                                                       : V1Pack.getOperand(1),
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
                                                       : V2Pack.getOperand(1)));
    }
  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);
  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
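///
/// For example, <0, 1, 4, 5> draws both low lanes from one input and both
/// high lanes from the other and is a single SHUFPS, while <0, 4, 1, 5>
/// needs both inputs in each half and is not.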
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle
      // to arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the
      // final shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the shuffle.
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }

  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
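///
/// For example, <0, 0, 2, 2> maps directly to MOVSLDUP and <1, 1, 3, 3> to
/// MOVSHDUP on SSE3, never leaving the floating point domain.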
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }
  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }
  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
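///
/// For example, Mask <4, 5, 0, 1, 6, 7, 2, 3> already has its words paired
/// into dwords, so no word shuffle is needed and the whole mask reduces to a
/// single PSHUFD placing dwords <2, 0, 3, 1>.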
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                 LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                 HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
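  // For example, Mask <1, 1, 1, 1, 3, 3, 3, 3> becomes a PSHUFLW with
  // {1, 1, 3, 3} followed by a PSHUFD with {0, 0, 1, 1}.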
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // input.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          MVT::v8i16, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }
10866 int PSHUFDMask[] = {0, 1, 2, 3};
10867 PSHUFDMask[ADWord] = BDWord;
10868 PSHUFDMask[BDWord] = ADWord;
10869 V = DAG.getBitcast(
10871 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10872 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10874 // Adjust the mask to match the new locations of A and B.
10875 for (int &M : Mask)
10876 if (M >= 0 && M/2 == ADWord)
10877 M = 2 * BDWord + M % 2;
10878 else if (M >= 0 && M/2 == BDWord)
10879 M = 2 * ADWord + M % 2;
10881 // Recurse back into this routine to re-compute state now that this isn't
10882 // a 3 and 1 problem.
10883 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10886 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10887 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10888 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10889 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10891 // At this point there are at most two inputs to the low and high halves from
10892 // each half. That means the inputs can always be grouped into dwords and
10893 // those dwords can then be moved to the correct half with a dword shuffle.
10894 // We use at most one low and one high word shuffle to collect these paired
10895 // inputs into dwords, and finally a dword shuffle to place them.
10896 int PSHUFLMask[4] = {-1, -1, -1, -1};
10897 int PSHUFHMask[4] = {-1, -1, -1, -1};
10898 int PSHUFDMask[4] = {-1, -1, -1, -1};
10900 // First fix the masks for all the inputs that are staying in their
10901 // original halves. This will then dictate the targets of the cross-half
10903 auto fixInPlaceInputs =
10904 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10905 MutableArrayRef<int> SourceHalfMask,
10906 MutableArrayRef<int> HalfMask, int HalfOffset) {
10907 if (InPlaceInputs.empty())
10909 if (InPlaceInputs.size() == 1) {
10910 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10911 InPlaceInputs[0] - HalfOffset;
10912 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10915 if (IncomingInputs.empty()) {
10916 // Just fix all of the in place inputs.
10917 for (int Input : InPlaceInputs) {
10918 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10919 PSHUFDMask[Input / 2] = Input / 2;
10924 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10925 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10926 InPlaceInputs[0] - HalfOffset;
10927 // Put the second input next to the first so that they are packed into
10928 // a dword. We find the adjacent index by toggling the low bit.
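    // (Illustrative: an input at word 2 pairs with word 3, and one at word 5
    // pairs with word 4.)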
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is
          // no free slot adjacent to one of the inputs. In this case, we have
          // to swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
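///
/// As an illustrative sketch (hypothetical mask, not taken from a test): for
/// a v8i16 mask starting <0, 8, 2, ...>, Scale is 2 and each word i expands
/// to the byte pair {2*i, 2*i+1}. Words sourced from V2 get 0x80 (the PSHUFB
/// zeroing index) in V1's byte mask and vice versa, so when both inputs are
/// used the two PSHUFB results can simply be OR'd together.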
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
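///
/// For instance (illustrative only), a mask like <0, 8, 1, 9, 2, 10, 3, 11>
/// is exactly an interleaving of the low halves of the two inputs and is best
/// served by the unpack-based strategy rather than a blend.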
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// with a dual of PACKUS.
///
/// \returns N above, the number of times even elements must be dropped, if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern
    // we want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
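        // (Worked instance, for illustration: with a 16-element single-input
        // mask, M == 16, so for N == 2 element i == 3 must be
        // (3 << 2) & 15 == 12.)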
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // it can be mapped.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
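    // (Illustrative: a mask taking every 4th byte gives NumEvenDrops == 2; we
    // AND with a v4i32 0xFF splat to keep one byte per dword, then PACKUS
    // twice to compact those bytes down.)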
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split a vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
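  // (For example, purely illustrative: splitting a v8i32 build_vector this
  // way yields two v4i32 build_vectors, which fold more readily than
  // subvector extractions of the wide node.)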
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
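  // (Illustrative: for a v4f64 mask <2, 3, 0, 1> every element crosses lanes,
  // so both flags are set and we continue below; for <1, 0, u, u> only the
  // low lane is populated and splitting wins.)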
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
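  // (That is, PERMMask == 0x23: the low destination half takes the high half
  // of source 2 and the high destination half takes its low half.)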
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}

/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a
    // 128-bit subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination
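  //
  // (Illustrative example: WidenedMask <1, 2> selects V1's high lane then
  // V2's low lane, giving PermMask == 1 | (2 << 4) == 0x21.)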

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);

  // If either input is a zero vector, replace it with an undef input.
  // Widened shuffle mask values < 2 are selecting lanes of V1.
  // Widened shuffle mask values >= 2 are selecting lanes of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(VT.is256BitVector() && "Expected 256-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;
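    // (Illustrative: with NumElts == 8, M == 13 gives HalfIdx == 3, the upper
    // half of V2, and HalfElt == 1.)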

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}

/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs and referencing only the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
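      // (Illustrative: with NumElts == 8 and NumBroadcastElts == 2 this
      // produces the mask <0, 1, 0, 1, 0, 1, 0, 1>.)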
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;
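  // (Illustrative: for v8f32 on AVX2, NumLanes == 2 and SubLaneScale == 2, so
  // we track 4 sub-lanes of 2 elements each.)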
12300 // Check that all the sources are coming from the same lane and see if we can
12301 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12302 // determine the source sub-lane for each destination sub-lane.
12303 int TopSrcSubLane = -1;
12304 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12305 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12306 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12307 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12309 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12310 // Extract the sub-lane mask, check that it all comes from the same lane
12311 // and normalize the mask entries to come from the first lane.
12312 int SrcLane = -1;
12313 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12314 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12315 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12316 if (M < 0)
12317 continue;
12318 int Lane = (M % NumElts) / NumLaneElts;
12319 if ((0 <= SrcLane) && (SrcLane != Lane))
12320 return SDValue();
12321 SrcLane = Lane;
12322 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12323 SubLaneMask[Elt] = LocalM;
12324 }
12326 // Whole sub-lane is UNDEF.
12327 if (SrcLane < 0)
12328 continue;
12330 // Attempt to match against the candidate repeated sub-lane masks.
12331 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12332 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12333 for (int i = 0; i != NumSubLaneElts; ++i) {
12334 if (M1[i] < 0 || M2[i] < 0)
12335 continue;
12336 if (M1[i] != M2[i])
12337 return false;
12338 }
12339 return true;
12340 };
12342 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12343 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12344 continue;
12346 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12347 for (int i = 0; i != NumSubLaneElts; ++i) {
12348 int M = SubLaneMask[i];
12349 if (M < 0)
12350 continue;
12351 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12352 "Unexpected mask element");
12353 RepeatedSubLaneMask[i] = M;
12354 }
12356 // Track the top most source sub-lane - by setting the remaining to UNDEF
12357 // we can greatly simplify shuffle matching.
12358 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12359 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12360 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12361 break;
12362 }
12364 // Bail if we failed to find a matching repeated sub-lane mask.
12365 if (Dst2SrcSubLanes[DstSubLane] < 0)
12366 return SDValue();
12367 }
12368 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12369 "Unexpected source lane");
12371 // Create a repeating shuffle mask for the entire vector.
12372 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12373 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12374 int Lane = SubLane / SubLaneScale;
12375 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12376 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12377 int M = RepeatedSubLaneMask[Elt];
12378 if (M < 0)
12379 continue;
12380 int Idx = (SubLane * NumSubLaneElts) + Elt;
12381 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12382 }
12383 }
12384 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12386 // Shuffle each source sub-lane to its destination.
12387 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12388 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12389 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12390 if (SrcSubLane < 0)
12391 continue;
12392 for (int j = 0; j != NumSubLaneElts; ++j)
12393 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12394 }
12396 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12397 SubLaneMask);
12398 }
12400 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12401 unsigned &ShuffleImm,
12402 ArrayRef<int> Mask) {
12403 int NumElts = VT.getVectorNumElements();
12404 assert(VT.getScalarSizeInBits() == 64 &&
12405 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12406 "Unexpected data type for VSHUFPD");
12408 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12409 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
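// For example, the v4f64 mask <0, 5, 2, 7> matches the SHUFPD pattern
// directly and encodes as the immediate 0b1010 (bit i is Mask[i] % 2).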
12410 ShuffleImm = 0;
12411 bool ShufpdMask = true;
12412 bool CommutableMask = true;
12413 for (int i = 0; i < NumElts; ++i) {
12414 if (Mask[i] == SM_SentinelUndef)
12415 continue;
12418 int Val = (i & 6) + NumElts * (i & 1);
12419 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12420 if (Mask[i] < Val || Mask[i] > Val + 1)
12421 ShufpdMask = false;
12422 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12423 CommutableMask = false;
12424 ShuffleImm |= (Mask[i] % 2) << i;
12425 }
12427 if (ShufpdMask)
12428 return true;
12429 if (CommutableMask) {
12430 std::swap(V1, V2);
12431 return true;
12432 }
12434 return false;
12435 }
12437 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12438 ArrayRef<int> Mask, SDValue V1,
12439 SDValue V2, SelectionDAG &DAG) {
12440 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12441 "Unexpected data type for VSHUFPD");
12443 unsigned Immediate = 0;
12444 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12445 return SDValue();
12447 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12448 DAG.getConstant(Immediate, DL, MVT::i8));
12451 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12452 ArrayRef<int> Mask, SDValue V1,
12453 SDValue V2, SelectionDAG &DAG) {
12454 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12455 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12457 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12458 if (V2.isUndef())
12459 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12461 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12462 }
12464 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12466 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12467 /// isn't available.
12468 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12469 const APInt &Zeroable,
12470 SDValue V1, SDValue V2,
12471 const X86Subtarget &Subtarget,
12472 SelectionDAG &DAG) {
12473 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12474 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12475 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12477 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12478 Zeroable, Subtarget, DAG))
12481 if (V2.isUndef()) {
12482 // Check for being able to broadcast a single element.
12483 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12484 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12487 // Use low duplicate instructions for masks that match their pattern.
12488 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12489 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12491 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12492 // Non-half-crossing single input shuffles can be lowered with an
12493 // interleaved permutation.
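// For example, Mask <1, 0, 3, 2> yields a VPERMILPMask of 0b0101, swapping
// the two doubles within each 128-bit lane.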
12494 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12495 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12496 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12497 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12500 // With AVX2 we have direct support for this permutation.
12501 if (Subtarget.hasAVX2())
12502 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12503 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12505 // Try to create an in-lane repeating shuffle mask and then shuffle the
12506 // results into the target lanes.
12507 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12508 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12511 // Otherwise, fall back.
12512 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12516 // Use dedicated unpack instructions for masks that match their pattern.
12518 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12521 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12522 Zeroable, Subtarget, DAG))
12525 // Check if the blend happens to exactly fit that of SHUFPD.
12527 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12530 // Try to create an in-lane repeating shuffle mask and then shuffle the
12531 // results into the target lanes.
12532 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12533 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12536 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12537 // shuffle. However, if we have AVX2 and either input is already in place,
12538 // we will be able to shuffle the other input even across lanes in a single
12539 // instruction, so skip this pattern.
12540 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12541 isShuffleMaskInputInPlace(1, Mask))))
12542 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12543 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12545 // If we have VLX support, we can use VEXPAND.
12546 if (Subtarget.hasVLX())
12547 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12548 V1, V2, DAG, Subtarget))
12551 // If we have AVX2 then we always want to lower with a blend because at v4 we
12552 // can fully permute the elements.
12553 if (Subtarget.hasAVX2())
12554 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12557 // Otherwise fall back on generic lowering.
12558 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12561 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12563 /// This routine is only called when we have AVX2 and thus a reasonable
12564 /// instruction set for v4i64 shuffling.
12565 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12566 const APInt &Zeroable,
12567 SDValue V1, SDValue V2,
12568 const X86Subtarget &Subtarget,
12569 SelectionDAG &DAG) {
12570 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12571 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12572 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12573 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12575 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12576 Zeroable, Subtarget, DAG))
12579 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12580 Zeroable, Subtarget, DAG))
12583 // Check for being able to broadcast a single element.
12584 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12585 Mask, Subtarget, DAG))
12588 if (V2.isUndef()) {
12589 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12590 // can use lower latency instructions that will operate on both lanes.
12591 SmallVector<int, 2> RepeatedMask;
12592 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12593 SmallVector<int, 4> PSHUFDMask;
12594 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
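// For example, the repeated v2i64 mask <1, 0> scales to the 32-bit mask
// <2, 3, 0, 1>, which PSHUFD then applies within each 128-bit lane.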
12595 return DAG.getBitcast(
12597 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12598 DAG.getBitcast(MVT::v8i32, V1),
12599 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12602 // AVX2 provides a direct instruction for permuting a single input across
12603 // lanes.
12604 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12605 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12608 // Try to use shift instructions.
12609 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12610 Zeroable, Subtarget, DAG))
12613 // If we have VLX support, we can use VALIGN or VEXPAND.
12614 if (Subtarget.hasVLX()) {
12615 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12616 Mask, Subtarget, DAG))
12619 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12620 V1, V2, DAG, Subtarget))
12624 // Try to use PALIGNR.
12625 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12626 Mask, Subtarget, DAG))
12629 // Use dedicated unpack instructions for masks that match their pattern.
12631 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12634 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12635 // shuffle. However, if we have AVX2 and either input is already in place,
12636 // we will be able to shuffle the other input even across lanes in a single
12637 // instruction, so skip this pattern.
12638 if (!isShuffleMaskInputInPlace(0, Mask) &&
12639 !isShuffleMaskInputInPlace(1, Mask))
12640 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12641 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12644 // Otherwise fall back on generic blend lowering.
12645 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12649 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12651 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12652 /// isn't available.
12653 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12654 const APInt &Zeroable,
12655 SDValue V1, SDValue V2,
12656 const X86Subtarget &Subtarget,
12657 SelectionDAG &DAG) {
12658 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12659 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12660 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12662 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12663 Zeroable, Subtarget, DAG))
12666 // Check for being able to broadcast a single element.
12667 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12668 Mask, Subtarget, DAG))
12671 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12672 // options to efficiently lower the shuffle.
12673 SmallVector<int, 4> RepeatedMask;
12674 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12675 assert(RepeatedMask.size() == 4 &&
12676 "Repeated masks must be half the mask width!");
12678 // Use even/odd duplicate instructions for masks that match their pattern.
12679 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12680 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12681 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12682 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12685 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12686 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12688 // Use dedicated unpack instructions for masks that match their pattern.
12690 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12693 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12694 // have already handled any direct blends.
12695 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12696 }
12698 // Try to create an in-lane repeating shuffle mask and then shuffle the
12699 // results into the target lanes.
12700 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12701 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12704 // If we have a single input shuffle with different shuffle patterns in the
12705 // two 128-bit lanes, use a variable mask with VPERMILPS.
12706 if (V2.isUndef()) {
12707 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12708 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12709 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12711 if (Subtarget.hasAVX2())
12712 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12714 // Otherwise, fall back.
12715 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12720 // shuffle.
12721 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12722 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12724 // If we have VLX support, we can use VEXPAND.
12725 if (Subtarget.hasVLX())
12726 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12727 V1, V2, DAG, Subtarget))
12730 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
12731 // try to split, since after the split we get more efficient code using the
12732 // vpunpcklwd and vpunpckhwd instructions than with vblend.
12733 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12734 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12738 // If we have AVX2 then we always want to lower with a blend because at v8 we
12739 // can fully permute the elements.
12740 if (Subtarget.hasAVX2())
12741 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12744 // Otherwise fall back on generic lowering.
12745 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12748 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12750 /// This routine is only called when we have AVX2 and thus a reasonable
12751 /// instruction set for v8i32 shuffling.
12752 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12753 const APInt &Zeroable,
12754 SDValue V1, SDValue V2,
12755 const X86Subtarget &Subtarget,
12756 SelectionDAG &DAG) {
12757 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12758 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12759 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12760 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12762 // Whenever we can lower this as a zext, that instruction is strictly faster
12763 // than any alternative. It also allows us to fold memory operands into the
12764 // shuffle in many cases.
12765 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12766 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12769 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern then
12770 // try to split, since after the split we get more efficient code with the
12771 // vpunpcklwd and vpunpckhwd instructions than with vblend.
12772 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12773 !Subtarget.hasAVX512())
12775 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12778 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12779 Zeroable, Subtarget, DAG))
12782 // Check for being able to broadcast a single element.
12783 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12784 Mask, Subtarget, DAG))
12787 // If the shuffle mask is repeated in each 128-bit lane we can use more
12788 // efficient instructions that mirror the shuffles across the two 128-bit
12789 // lanes.
12790 SmallVector<int, 4> RepeatedMask;
12791 bool Is128BitLaneRepeatedShuffle =
12792 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12793 if (Is128BitLaneRepeatedShuffle) {
12794 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12796 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12797 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12799 // Use dedicated unpack instructions for masks that match their pattern.
12801 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12805 // Try to use shift instructions.
12806 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12807 Zeroable, Subtarget, DAG))
12810 // If we have VLX support, we can use VALIGN or EXPAND.
12811 if (Subtarget.hasVLX()) {
12812 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12813 Mask, Subtarget, DAG))
12816 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12817 V1, V2, DAG, Subtarget))
12821 // Try to use byte rotation instructions.
12822 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12823 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12826 // Try to create an in-lane repeating shuffle mask and then shuffle the
12827 // results into the target lanes.
12828 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12829 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12832 // If the shuffle patterns aren't repeated but it is a single input, directly
12833 // generate a cross-lane VPERMD instruction.
12834 if (V2.isUndef()) {
12835 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12836 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12839 // Assume that a single SHUFPS is faster than an alternative sequence of
12840 // multiple instructions (even if the CPU has a domain penalty).
12841 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12842 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12843 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12844 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12845 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12846 CastV1, CastV2, DAG);
12847 return DAG.getBitcast(MVT::v8i32, ShufPS);
12850 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12851 // shuffle.
12852 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12853 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12856 // Otherwise fall back on generic blend lowering.
12857 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12861 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12863 /// This routine is only called when we have AVX2 and thus a reasonable
12864 /// instruction set for v16i16 shuffling.
12865 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12866 const APInt &Zeroable,
12867 SDValue V1, SDValue V2,
12868 const X86Subtarget &Subtarget,
12869 SelectionDAG &DAG) {
12870 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12871 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12872 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12873 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12875 // Whenever we can lower this as a zext, that instruction is strictly faster
12876 // than any alternative. It also allows us to fold memory operands into the
12877 // shuffle in many cases.
12878 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12879 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12882 // Check for being able to broadcast a single element.
12883 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12884 Mask, Subtarget, DAG))
12887 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12888 Zeroable, Subtarget, DAG))
12891 // Use dedicated unpack instructions for masks that match their pattern.
12893 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12896 // Try to use shift instructions.
12897 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12898 Zeroable, Subtarget, DAG))
12901 // Try to use byte rotation instructions.
12902 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12903 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12906 // Try to create an in-lane repeating shuffle mask and then shuffle the
12907 // results into the target lanes.
12908 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12909 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12912 if (V2.isUndef()) {
12913 // There are no generalized cross-lane shuffle operations available on i16
12914 // element types.
12915 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12916 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12919 SmallVector<int, 8> RepeatedMask;
12920 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12921 // As this is a single-input shuffle, the repeated mask should be
12922 // a strictly valid v8i16 mask that we can pass through to the v8i16
12923 // lowering to handle even the v16 case.
12924 return lowerV8I16GeneralSingleInputVectorShuffle(
12925 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12929 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12930 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12933 // AVX512BWVL can lower to VPERMW.
12934 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12935 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12937 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12938 // shuffle.
12939 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12940 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12943 // Otherwise fall back on generic lowering.
12944 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12947 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12949 /// This routine is only called when we have AVX2 and thus a reasonable
12950 /// instruction set for v32i8 shuffling.
12951 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12952 const APInt &Zeroable,
12953 SDValue V1, SDValue V2,
12954 const X86Subtarget &Subtarget,
12955 SelectionDAG &DAG) {
12956 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12957 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12958 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12959 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12961 // Whenever we can lower this as a zext, that instruction is strictly faster
12962 // than any alternative. It also allows us to fold memory operands into the
12963 // shuffle in many cases.
12964 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12965 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12968 // Check for being able to broadcast a single element.
12969 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12970 Mask, Subtarget, DAG))
12973 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12974 Zeroable, Subtarget, DAG))
12977 // Use dedicated unpack instructions for masks that match their pattern.
12979 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12982 // Try to use shift instructions.
12983 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12984 Zeroable, Subtarget, DAG))
12987 // Try to use byte rotation instructions.
12988 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12989 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12992 // Try to create an in-lane repeating shuffle mask and then shuffle the
12993 // results into the target lanes.
12994 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12995 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12998 // There are no generalized cross-lane shuffle operations available on i8
12999 // element types.
13000 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13001 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13004 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13005 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13008 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13009 // shuffle.
13010 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13011 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13014 // Otherwise fall back on generic lowering.
13015 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13018 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13020 /// This routine either breaks down the specific type of a 256-bit x86 vector
13021 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13022 /// together based on the available instructions.
13023 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13024 MVT VT, SDValue V1, SDValue V2,
13025 const APInt &Zeroable,
13026 const X86Subtarget &Subtarget,
13027 SelectionDAG &DAG) {
13028 // If we have a single input to the zero element, insert that into V1 if we
13029 // can do so cheaply.
13030 int NumElts = VT.getVectorNumElements();
13031 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13033 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13034 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13035 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13036 return Insertion;
13038 // Handle special cases where the lower or upper half is UNDEF.
13039 if (SDValue V =
13040 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13041 return V;
13043 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13044 // can check for those subtargets here and avoid much of the subtarget
13045 // querying in the per-vector-type lowering routines. With AVX1 we have
13046 // essentially *zero* ability to manipulate a 256-bit vector with integer
13047 // types. Since we'll use floating point types there eventually, just
13048 // immediately cast everything to a float and operate entirely in that domain.
13049 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13050 int ElementBits = VT.getScalarSizeInBits();
13051 if (ElementBits < 32) {
13052 // No floating point type available; if we can't use the bit operations
13053 // for masking/blending then decompose into 128-bit vectors.
13054 if (SDValue V =
13055 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13056 return V;
13057 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13058 return V;
13059 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13062 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13063 VT.getVectorNumElements());
13064 V1 = DAG.getBitcast(FpVT, V1);
13065 V2 = DAG.getBitcast(FpVT, V2);
13066 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13069 switch (VT.SimpleTy) {
13070 case MVT::v4f64:
13071 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13072 case MVT::v4i64:
13073 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13074 case MVT::v8f32:
13075 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13076 case MVT::v8i32:
13077 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13078 case MVT::v16i16:
13079 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13080 case MVT::v32i8:
13081 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13083 default:
13084 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13085 }
13086 }
13088 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
13089 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13090 ArrayRef<int> Mask, SDValue V1,
13091 SDValue V2, SelectionDAG &DAG) {
13092 assert(VT.getScalarSizeInBits() == 64 &&
13093 "Unexpected element type size for 128bit shuffle.");
13095 // Handling a 256-bit vector would require VLX, and the function
13096 // lowerV2X128VectorShuffle() is most probably a better solution for it.
13097 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
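// For example, the v8i64 mask <2, 3, 8, 9, 6, 7, 12, 13> widens to the
// 128-bit subvector mask <1, 4, 3, 6>.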
13099 SmallVector<int, 4> WidenedMask;
13100 if (!canWidenShuffleElements(Mask, WidenedMask))
13101 return SDValue();
13103 // Check for patterns which can be matched with a single insert of a 256-bit
13104 // subvector.
13105 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13106 {0, 1, 2, 3, 0, 1, 2, 3});
13107 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13108 {0, 1, 2, 3, 8, 9, 10, 11})) {
13109 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13110 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13111 DAG.getIntPtrConstant(0, DL));
13112 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13113 OnlyUsesV1 ? V1 : V2,
13114 DAG.getIntPtrConstant(0, DL));
13115 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13118 assert(WidenedMask.size() == 4);
13120 // See if this is an insertion of the lower 128-bits of V2 into V1.
13121 bool IsInsert = true;
13122 int V2Index = -1;
13123 for (int i = 0; i < 4; ++i) {
13124 assert(WidenedMask[i] >= -1);
13125 if (WidenedMask[i] < 0)
13126 continue;
13128 // Make sure all V1 subvectors are in place.
13129 if (WidenedMask[i] < 4) {
13130 if (WidenedMask[i] != i) {
13131 IsInsert = false;
13132 break;
13133 }
13134 } else {
13135 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13136 if (V2Index >= 0 || WidenedMask[i] != 4) {
13137 IsInsert = false;
13138 break;
13139 }
13140 V2Index = i;
13141 }
13142 }
13143 if (IsInsert && V2Index >= 0) {
13144 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13145 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13146 DAG.getIntPtrConstant(0, DL));
13147 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13148 }
13150 // Try to lower to vshuf64x2/vshuf32x4.
13151 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13152 unsigned PermMask = 0;
13153 // Ensure elements came from the same Op.
13154 for (int i = 0; i < 4; ++i) {
13155 assert(WidenedMask[i] >= -1);
13156 if (WidenedMask[i] < 0)
13157 continue;
13159 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13160 unsigned OpIndex = i / 2;
13161 if (Ops[OpIndex].isUndef())
13162 Ops[OpIndex] = Op;
13163 else if (Ops[OpIndex] != Op)
13164 return SDValue();
13166 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13167 // bits defined by a vshuf64x2 instruction's immediate control byte.
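// For example, WidenedMask <0, 2, 5, 7> takes 128-bit chunks 0 and 2 from V1
// and chunks 1 and 3 from V2, giving a PermMask of 0xD8.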
13168 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13169 }
13171 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13172 DAG.getConstant(PermMask, DL, MVT::i8));
13175 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13176 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13177 const APInt &Zeroable,
13178 SDValue V1, SDValue V2,
13179 const X86Subtarget &Subtarget,
13180 SelectionDAG &DAG) {
13181 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13182 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13183 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13185 if (V2.isUndef()) {
13186 // Use low duplicate instructions for masks that match their pattern.
13187 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13188 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13190 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13191 // Non-half-crossing single input shuffles can be lowered with an
13192 // interleaved permutation.
13193 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13194 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13195 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13196 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13197 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13198 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13201 SmallVector<int, 4> RepeatedMask;
13202 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13203 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13204 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13207 if (SDValue Shuf128 =
13208 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13211 if (SDValue Unpck =
13212 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13215 // Check if the blend happens to exactly fit that of SHUFPD.
13217 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13220 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13221 V2, DAG, Subtarget))
13224 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13225 Zeroable, Subtarget, DAG))
13228 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13231 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13232 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13233 const APInt &Zeroable,
13234 SDValue V1, SDValue V2,
13235 const X86Subtarget &Subtarget,
13236 SelectionDAG &DAG) {
13237 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13238 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13239 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13241 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13242 // options to efficiently lower the shuffle.
13243 SmallVector<int, 4> RepeatedMask;
13244 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13245 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13247 // Use even/odd duplicate instructions for masks that match their pattern.
13248 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13249 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13250 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13251 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13254 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13255 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13257 // Use dedicated unpack instructions for masks that match their pattern.
13258 if (SDValue Unpck =
13259 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13262 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13263 Zeroable, Subtarget, DAG))
13266 // Otherwise, fall back to a SHUFPS sequence.
13267 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13269 // If we have AVX512F support, we can use VEXPAND.
13270 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13271 V1, V2, DAG, Subtarget))
13274 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13277 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13278 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13279 const APInt &Zeroable,
13280 SDValue V1, SDValue V2,
13281 const X86Subtarget &Subtarget,
13282 SelectionDAG &DAG) {
13283 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13284 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13285 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13287 if (SDValue Shuf128 =
13288 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13291 if (V2.isUndef()) {
13292 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13293 // can use lower latency instructions that will operate on all four
13294 // 128-bit lanes.
13295 SmallVector<int, 2> Repeated128Mask;
13296 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13297 SmallVector<int, 4> PSHUFDMask;
13298 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13299 return DAG.getBitcast(
13301 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13302 DAG.getBitcast(MVT::v16i32, V1),
13303 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13306 SmallVector<int, 4> Repeated256Mask;
13307 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13308 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13309 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13312 // Try to use shift instructions.
13313 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13314 Zeroable, Subtarget, DAG))
13317 // Try to use VALIGN.
13318 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13319 Mask, Subtarget, DAG))
13322 // Try to use PALIGNR.
13323 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13324 Mask, Subtarget, DAG))
13327 if (SDValue Unpck =
13328 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13330 // If we have AVX512F support, we can use VEXPAND.
13331 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13332 V2, DAG, Subtarget))
13335 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13336 Zeroable, Subtarget, DAG))
13339 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13342 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13343 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13344 const APInt &Zeroable,
13345 SDValue V1, SDValue V2,
13346 const X86Subtarget &Subtarget,
13347 SelectionDAG &DAG) {
13348 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13349 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13350 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13352 // Whenever we can lower this as a zext, that instruction is strictly faster
13353 // than any alternative. It also allows us to fold memory operands into the
13354 // shuffle in many cases.
13355 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13356 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13359 // If the shuffle mask is repeated in each 128-bit lane we can use more
13360 // efficient instructions that mirror the shuffles across the four 128-bit
13361 // lanes.
13362 SmallVector<int, 4> RepeatedMask;
13363 bool Is128BitLaneRepeatedShuffle =
13364 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13365 if (Is128BitLaneRepeatedShuffle) {
13366 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13368 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13369 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13371 // Use dedicated unpack instructions for masks that match their pattern.
13373 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13377 // Try to use shift instructions.
13378 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13379 Zeroable, Subtarget, DAG))
13382 // Try to use VALIGN.
13383 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13384 Mask, Subtarget, DAG))
13387 // Try to use byte rotation instructions.
13388 if (Subtarget.hasBWI())
13389 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13390 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13393 // Assume that a single SHUFPS is faster than using a permv shuffle.
13394 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13395 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13396 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13397 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13398 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13399 CastV1, CastV2, DAG);
13400 return DAG.getBitcast(MVT::v16i32, ShufPS);
13402 // If we have AVX512F support, we can use VEXPAND.
13403 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13404 V1, V2, DAG, Subtarget))
13407 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13408 Zeroable, Subtarget, DAG))
13410 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13413 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13414 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13415 const APInt &Zeroable,
13416 SDValue V1, SDValue V2,
13417 const X86Subtarget &Subtarget,
13418 SelectionDAG &DAG) {
13419 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13420 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13421 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13422 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13424 // Whenever we can lower this as a zext, that instruction is strictly faster
13425 // than any alternative. It also allows us to fold memory operands into the
13426 // shuffle in many cases.
13427 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13428 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13431 // Use dedicated unpack instructions for masks that match their pattern.
13433 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13436 // Try to use shift instructions.
13437 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13438 Zeroable, Subtarget, DAG))
13441 // Try to use byte rotation instructions.
13442 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13443 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13446 if (V2.isUndef()) {
13447 SmallVector<int, 8> RepeatedMask;
13448 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13449 // As this is a single-input shuffle, the repeated mask should be
13450 // a strictly valid v8i16 mask that we can pass through to the v8i16
13451 // lowering to handle even the v32 case.
13452 return lowerV8I16GeneralSingleInputVectorShuffle(
13453 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13457 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13458 Zeroable, Subtarget, DAG))
13461 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13464 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13465 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13466 const APInt &Zeroable,
13467 SDValue V1, SDValue V2,
13468 const X86Subtarget &Subtarget,
13469 SelectionDAG &DAG) {
13470 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13471 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13472 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13473 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13475 // Whenever we can lower this as a zext, that instruction is strictly faster
13476 // than any alternative. It also allows us to fold memory operands into the
13477 // shuffle in many cases.
13478 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13479 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13482 // Use dedicated unpack instructions for masks that match their pattern.
13484 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13487 // Try to use shift instructions.
13488 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13489 Zeroable, Subtarget, DAG))
13492 // Try to use byte rotation instructions.
13493 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13494 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13497 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13498 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13501 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13502 if (Subtarget.hasVBMI())
13503 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13505 // Try to create an in-lane repeating shuffle mask and then shuffle the
13506 // results into the target lanes.
13507 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13508 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13511 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13512 Zeroable, Subtarget, DAG))
13515 // FIXME: Implement direct support for this type!
13516 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13519 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13521 /// This routine either breaks down the specific type of a 512-bit x86 vector
13522 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13523 /// together based on the available instructions.
13524 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13525 MVT VT, SDValue V1, SDValue V2,
13526 const APInt &Zeroable,
13527 const X86Subtarget &Subtarget,
13528 SelectionDAG &DAG) {
13529 assert(Subtarget.hasAVX512() &&
13530 "Cannot lower 512-bit vectors w/ basic ISA!");
13532 // If we have a single input to the zero element, insert that into V1 if we
13533 // can do so cheaply.
13534 int NumElts = Mask.size();
13535 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13537 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13538 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13539 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13540 return Insertion;
13542 // Check for being able to broadcast a single element.
13543 if (SDValue Broadcast =
13544 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13545 return Broadcast;
13547 // Dispatch to each element type for lowering. If we don't have support for
13548 // specific element type shuffles at 512 bits, immediately split them and
13549 // lower them. Each lowering routine of a given type is allowed to assume that
13550 // the requisite ISA extensions for that element type are available.
13551 switch (VT.SimpleTy) {
13552 case MVT::v8f64:
13553 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13554 case MVT::v16f32:
13555 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13556 case MVT::v8i64:
13557 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13558 case MVT::v16i32:
13559 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13560 case MVT::v32i16:
13561 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13562 case MVT::v64i8:
13563 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13565 default:
13566 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13567 }
13568 }
13570 // Lower vXi1 vector shuffles.
13571 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13572 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13573 // vector, shuffle and then truncate it back.
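// For example, a v16i1 shuffle is performed by sign-extending to v16i32,
// shuffling the v16i32 vector, and truncating the result back to v16i1.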
13574 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13575 MVT VT, SDValue V1, SDValue V2,
13576 const X86Subtarget &Subtarget,
13577 SelectionDAG &DAG) {
13578 assert(Subtarget.hasAVX512() &&
13579 "Cannot lower 512-bit vectors w/o basic ISA!");
13580 MVT ExtVT;
13581 switch (VT.SimpleTy) {
13582 default:
13583 llvm_unreachable("Expected a vector of i1 elements");
13584 case MVT::v2i1:
13585 ExtVT = MVT::v2i64;
13586 break;
13587 case MVT::v4i1:
13588 ExtVT = MVT::v4i32;
13589 break;
13590 case MVT::v8i1:
13591 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13592 break;
13593 case MVT::v16i1:
13594 ExtVT = MVT::v16i32;
13595 break;
13596 case MVT::v32i1:
13597 ExtVT = MVT::v32i16;
13598 break;
13599 case MVT::v64i1:
13600 ExtVT = MVT::v64i8;
13601 break;
13602 }
13604 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13605 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13606 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13607 V1 = getOnesVector(ExtVT, DAG, DL);
13608 else
13609 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13611 if (V2.isUndef())
13612 V2 = DAG.getUNDEF(ExtVT);
13613 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13614 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13615 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13616 V2 = getOnesVector(ExtVT, DAG, DL);
13617 else
13618 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13620 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13621 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13622 int NumElems = VT.getVectorNumElements();
13623 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13624 (Subtarget.hasDQI() && (NumElems < 32)))
13625 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13627 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13630 /// Helper function that returns true if the shuffle mask should be
13631 /// commuted to improve canonicalization.
13632 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13633 int NumElements = Mask.size();
13635 int NumV1Elements = 0, NumV2Elements = 0;
13636 for (int M : Mask)
13637 if (M < 0)
13638 continue;
13639 else if (M < NumElements)
13640 ++NumV1Elements;
13641 else
13642 ++NumV2Elements;
13644 // Commute the shuffle as needed such that more elements come from V1 than
13645 // V2. This allows us to match the shuffle pattern strictly on how many
13646 // elements come from V1 without handling the symmetric cases.
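// For example, the v4 mask <4, 1, 6, 7> uses three V2 elements but only one
// V1 element, so it is commuted to <0, 5, 2, 3>.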
13647 if (NumV2Elements > NumV1Elements)
13648 return true;
13650 assert(NumV1Elements > 0 && "No V1 indices");
13652 if (NumV2Elements == 0)
13653 return false;
13655 // When the number of V1 and V2 elements is the same, try to minimize the
13656 // number of uses of V2 in the low half of the vector. When that is tied,
13657 // ensure that the sum of indices for V1 is equal to or lower than the sum of
13658 // indices for V2. When those are equal, try to ensure that the number of odd
13659 // indices for V1 is lower than the number of odd indices for V2.
13660 if (NumV1Elements == NumV2Elements) {
13661 int LowV1Elements = 0, LowV2Elements = 0;
13662 for (int M : Mask.slice(0, NumElements / 2))
13663 if (M >= NumElements)
13664 ++LowV2Elements;
13665 else if (M >= 0)
13666 ++LowV1Elements;
13667 if (LowV2Elements > LowV1Elements)
13668 return true;
13669 if (LowV2Elements == LowV1Elements) {
13670 int SumV1Indices = 0, SumV2Indices = 0;
13671 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13672 if (Mask[i] >= NumElements)
13673 SumV2Indices += i;
13674 else if (Mask[i] >= 0)
13675 SumV1Indices += i;
13676 if (SumV2Indices < SumV1Indices)
13677 return true;
13678 if (SumV2Indices == SumV1Indices) {
13679 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13680 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13681 if (Mask[i] >= NumElements)
13682 NumV2OddIndices += i % 2;
13683 else if (Mask[i] >= 0)
13684 NumV1OddIndices += i % 2;
13685 if (NumV2OddIndices < NumV1OddIndices)
13686 return true;
13687 }
13688 }
13689 }
13691 return false;
13692 }
13694 /// \brief Top-level lowering for x86 vector shuffles.
13696 /// This handles decomposition, canonicalization, and lowering of all x86
13697 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13698 /// above in helper routines. The canonicalization attempts to widen shuffles
13699 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13700 /// s.t. only one of the two inputs needs to be tested, etc.
13701 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13702 SelectionDAG &DAG) {
13703 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13704 ArrayRef<int> Mask = SVOp->getMask();
13705 SDValue V1 = Op.getOperand(0);
13706 SDValue V2 = Op.getOperand(1);
13707 MVT VT = Op.getSimpleValueType();
13708 int NumElements = VT.getVectorNumElements();
13709 SDLoc DL(Op);
13710 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13712 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13713 "Can't lower MMX shuffles");
13715 bool V1IsUndef = V1.isUndef();
13716 bool V2IsUndef = V2.isUndef();
13717 if (V1IsUndef && V2IsUndef)
13718 return DAG.getUNDEF(VT);
13720 // When we create a shuffle node we put the UNDEF node in the second operand,
13721 // but in some cases the first operand may be transformed to UNDEF.
13722 // In this case we should just commute the node.
13723 if (V1IsUndef)
13724 return DAG.getCommutedVectorShuffle(*SVOp);
13726 // Check for non-undef masks pointing at an undef vector and make the masks
13727 // undef as well. This makes it easier to match the shuffle based solely on
13728 // the mask.
13729 if (V2IsUndef)
13730 for (int M : Mask)
13731 if (M >= NumElements) {
13732 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13733 for (int &M : NewMask)
13734 if (M >= NumElements)
13735 M = -1;
13736 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13739 // Check for illegal shuffle mask element index values.
13740 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13741 assert(llvm::all_of(Mask,
13742 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13743 "Out of bounds shuffle index");
13745 // We actually see shuffles that are entirely re-arrangements of a set of
13746 // zero inputs. This mostly happens while decomposing complex shuffles into
13747 // simple ones. Directly lower these as a buildvector of zeros.
13748 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13749 if (Zeroable.isAllOnesValue())
13750 return getZeroVector(VT, Subtarget, DAG, DL);
13752 // Try to collapse shuffles into using a vector type with fewer elements but
13753 // wider element types. We cap this to not form integers or floating point
13754 // elements wider than 64 bits, but it might be interesting to form i128
13755 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
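// For example, a v4i32 shuffle with mask <0, 1, 6, 7> can be performed as a
// v2i64 shuffle with the widened mask <0, 3>.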
13756 SmallVector<int, 16> WidenedMask;
13757 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13758 canWidenShuffleElements(Mask, WidenedMask)) {
13759 MVT NewEltVT = VT.isFloatingPoint()
13760 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13761 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13762 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13763 // Make sure that the new vector type is legal. For example, v2f64 isn't
13764 // legal on SSE1.
13765 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13766 V1 = DAG.getBitcast(NewVT, V1);
13767 V2 = DAG.getBitcast(NewVT, V2);
13768 return DAG.getBitcast(
13769 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13773 // Commute the shuffle if it will improve canonicalization.
13774 if (canonicalizeShuffleMaskWithCommute(Mask))
13775 return DAG.getCommutedVectorShuffle(*SVOp);
13777 // For each vector width, delegate to a specialized lowering routine.
13778 if (VT.is128BitVector())
13779 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13780 DAG);
13782 if (VT.is256BitVector())
13783 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13784 DAG);
13786 if (VT.is512BitVector())
13787 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13788 DAG);
13790 if (Is1BitVector)
13791 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13793 llvm_unreachable("Unimplemented!");
13796 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13797 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13798 const X86Subtarget &Subtarget,
13799 SelectionDAG &DAG) {
13800 SDValue Cond = Op.getOperand(0);
13801 SDValue LHS = Op.getOperand(1);
13802 SDValue RHS = Op.getOperand(2);
13803 SDLoc dl(Op);
13804 MVT VT = Op.getSimpleValueType();
13806 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13807 return SDValue();
13808 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13810 // Only non-legal VSELECTs reach this lowering, convert those into generic
13811 // shuffles and re-use the shuffle lowering path for blends.
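// For example, for v4i32 the constant condition <-1, 0, -1, 0> becomes the
// shuffle mask <0, 5, 2, 7>, picking elements alternately from LHS and RHS.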
13812 SmallVector<int, 32> Mask;
13813 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13814 SDValue CondElt = CondBV->getOperand(i);
13816 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13819 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13822 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13823 // A vselect where all conditions and data are constants can be optimized into
13824 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13825 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13826 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13827 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13830 // If this VSELECT has a vector if i1 as a mask, it will be directly matched
13831 // with patterns on the mask registers on AVX-512.
13832 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
13835 // Try to lower this to a blend-style vector shuffle. This can handle all
13836 // constant condition cases.
13837 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13840 // Variable blends are only legal from SSE4.1 onward.
13841 if (!Subtarget.hasSSE41())
13845 MVT VT = Op.getSimpleValueType();
13847 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
13848 // into an i1 condition so that we can use the mask-based 512-bit blend
13850 if (VT.getSizeInBits() == 512) {
13851 SDValue Cond = Op.getOperand(0);
13852 // The vNi1 condition case should be handled above as it can be trivially
13854 assert(Cond.getValueType().getScalarSizeInBits() ==
13855 VT.getScalarSizeInBits() &&
13856 "Should have a size-matched integer condition!");
13857 // Build a mask by testing the condition against itself (tests for zero).
13858 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
13859 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
13860 // Now return a new VSELECT using the mask.
13861 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
13864 // Only some types will be legal on some subtargets. If we can emit a legal
13865 // VSELECT-matching blend, return Op, and but if we need to expand, return
13867 switch (VT.SimpleTy) {
13869 // Most of the vector types have blends past SSE4.1.
13873 // The byte blends for AVX vectors were introduced only in AVX2.
13874 if (Subtarget.hasAVX2())
13881 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13882 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13885 // FIXME: We should custom lower this by fixing the condition and using i8
13891 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13892 MVT VT = Op.getSimpleValueType();
13895 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13898 if (VT.getSizeInBits() == 8) {
13899 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13900 Op.getOperand(0), Op.getOperand(1));
13901 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13902 DAG.getValueType(VT));
13903 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13906 if (VT == MVT::f32) {
13907 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13908 // the result back to FR32 register. It's only worth matching if the
13909 // result has a single use which is a store or a bitcast to i32. And in
13910 // the case of a store, it's not worth it if the index is a constant 0,
13911 // because a MOVSSmr can be used instead, which is smaller and faster.
13912 if (!Op.hasOneUse())
13914 SDNode *User = *Op.getNode()->use_begin();
13915 if ((User->getOpcode() != ISD::STORE ||
13916 isNullConstant(Op.getOperand(1))) &&
13917 (User->getOpcode() != ISD::BITCAST ||
13918 User->getValueType(0) != MVT::i32))
13920 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13921 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13923 return DAG.getBitcast(MVT::f32, Extract);
13926 if (VT == MVT::i32 || VT == MVT::i64) {
13927 // ExtractPS/pextrq works with constant index.
13928 if (isa<ConstantSDNode>(Op.getOperand(1)))
13935 /// Extract one bit from mask vector, like v16i1 or v8i1.
13936 /// AVX-512 feature.
13938 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13939 SDValue Vec = Op.getOperand(0);
13941 MVT VecVT = Vec.getSimpleValueType();
13942 SDValue Idx = Op.getOperand(1);
13943 MVT EltVT = Op.getSimpleValueType();
13945 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13946 "Unexpected vector type in ExtractBitFromMaskVector");
13948 // variable index can't be handled in mask registers,
13949 // extend vector to VR512/128
13950 if (!isa<ConstantSDNode>(Idx)) {
13951 unsigned NumElts = VecVT.getVectorNumElements();
13952 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
13953 // than extending to 128/256bit.
13954 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
13955 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
13956 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
13957 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13958 ExtVT.getVectorElementType(), Ext, Idx);
13959 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13962 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13963 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13964 (VecVT.getVectorNumElements() < 8)) {
13965 // Use kshiftlw/rw instruction.
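    // Worked example (illustrative): to extract bit 3 of a mask padded out
    // to v16i1 below, MaxShift = 15; kshiftl by 15 - 3 = 12 moves bit 3 up
    // to bit 15, and kshiftr by 15 brings it back down to bit 0, where it is
    // extracted.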
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (MaxShift != IdxVal)
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get these performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // Example: extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |  Ports pressure in cycles  |    |
    // |  Uops  | 0 - DV | 5   | 6 | 7 |     |    |
    // ---------------------------------------------
    // |   1    |        | 1.0 |   |   |     | CP | vmovd xmm1, edi
    // |   1    |        | 1.0 |   |   |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0    | 1.0 |   |   |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles      |    |
    // |Uops|  1 | 2 - D   | 3 - D   | 4 | 5 |    |
    // ---------------------------------------------------------
    // |2^  |    | 0.5     | 0.5     |1.0|   | CP | vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5 |         |         |   |0.5|    | lea rax, ptr [rsp-0x18]
    // |1   |    |0.5, 0.5 |0.5, 0.5 |   |   | CP | mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
  // vector and then extract the element from that 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
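    // For instance (illustrative): element 6 of a v8i32 sits in the upper
    // 128-bit chunk, where ElemsPerChunk = 4, so 6 & (4 - 1) = 2 is its
    // position within that chunk.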
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // the stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
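    // E.g. (illustrative) for IdxVal = 5: the byte is outside dword 0, so the
    // word path extracts word 5 / 2 = 2 as an i16 and shifts it right by
    // (5 % 2) * 8 = 8 bits to isolate the upper byte of that word.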
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index: extend the source and destination, insert the
    // element, and then truncate the result.
    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt),
                                Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
  unsigned NumElems = VecVT.getVectorNumElements();

  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

  // Insertion of one bit into the first position.
  if (IdxVal == 0) {
    // Clean the top bits of the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clean the first bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  // Insertion of one bit into the last position.
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clean the last bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

  // Use a shuffle to insert the element.
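  // E.g. (illustrative) with NumElems = 8 and IdxVal = 3 this builds the mask
  // <0,1,2,8,4,5,6,7>: indices >= NumElems select from the second operand, so
  // lane 3 comes from EltInVec and every other lane comes from Vec.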
  SmallVector<int, 64> MaskVec(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == IdxVal) ? NumElems : i;

  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}

SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
  // be beneficial if we are inserting several zeros and can combine the masks.
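  // E.g. (illustrative) inserting zero into lane 2 of a v4i32 builds the
  // blend mask <0,1,6,3> against a zero vector, which lowers to a single
  // blend instruction rather than a GPR-to-vector insertion.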
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : DAG.getConstant(-1, dl, VT);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector.
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.
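      //
      // E.g. (illustrative) for (insert (extract %u, 3), %v, 2) the combined
      // immediate would be 0b11100000 (0xE0): source lane 3 in bits [7:6],
      // destination lane 2 in bits [5:4], and an all-zero zero mask.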
      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with a constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace an xor+movd with xorps, and it simplifies
  // further analysis.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();

  assert((In.getSimpleValueType().is256BitVector() ||
          In.getSimpleValueType().is512BitVector()) &&
         "Can only extract from 256-bit or 512-bit vectors");

  // If the input is a buildvector just emit a smaller one.
  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
  if (In.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));

  // Everything else is legal.
  return Op;
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// during isel.
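// (Illustrative sketch, not tied to a specific .td pattern: a PIC-mode global
// access such as (load (WrapperRIP (TargetGlobalAddress @g))) can be selected
// into a single RIP-relative load like "movq g(%rip), %rax", whereas a bare
// TargetGlobalAddress node has no matching instruction pattern of its own.)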
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has
  // calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                    .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));
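  // For context: address spaces 256 and 257 are the x86 segment overrides,
  // %gs and %fs respectively, so this is effectively the address 0 relative
  // to the thread segment register.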
  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // the initial-exec model on x86-64.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit())
        return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                         Subtarget.is64Bit());
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }
  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as a call. Inform MFI that this function has
    // calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    //   Windows 64bit: gs:0x58
    //   Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}
/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
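/// For example (an illustrative sketch): for a 64-bit SHL_PARTS on a 32-bit
/// target, the fast path computes hi = shld(hi, lo, amt) and
/// lo = shl(lo, amt & 31); when amt >= 32, CMOVs replace that with
/// hi = lo << (amt & 31) and lo = 0.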
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is greater than or equal to the width of a part, we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD.
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}
/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
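  //
  // A sketch of why this works: with the biased exponents above,
  // (0x43300000, lo) is the double 2^52 + lo, and (0x45300000, hi) is the
  // double 2^84 + hi * 2^32. Subtracting c1 = { 2^52, 2^84 } leaves
  // { lo, hi * 2^32 }, so the final horizontal add yields hi * 2^32 + lo,
  // the original u64 value, with a single rounding at the last add.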
  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  SDValue Result;
  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}
/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // The result is already f64; no further rounding is needed.
  return Sub;
}
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear the upper part of LO and the lower part of HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
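  // A quick sanity check of that bit pattern (sketch): 0xD3000080 has sign 1,
  // exponent 0xA6 (2^39), and mantissa 0x000080, i.e. -(1 + 2^-16) * 2^39
  // = -(2^39 + 2^23). Adding it to "hi" cancels the 2^39 and 2^23 biases
  // introduced by the 0x53000000/0x4b000000 patterns, so lo + fhi == v.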
  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  // return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  MVT SrcVT = N0.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
    return Op;
  }

  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return LowerUINT_TO_FP_i64(Op, DAG);

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo());
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
                               MachinePointerInfo());
  // For an i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);
15377 // Check whether the sign bit is set.
15378 SDValue SignSet = DAG.getSetCC(
15379 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15380 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15382 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15383 SDValue FudgePtr = DAG.getConstantPool(
15384 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15386 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15387 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15388 SDValue Four = DAG.getIntPtrConstant(4, dl);
15389 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15390 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15392 // Load the value out, extending it from f32 to f80.
15393 // FIXME: Avoid the extend by constructing the right constant pool?
15394 SDValue Fudge = DAG.getExtLoad(
15395 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15396 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15397 /* Alignment = */ 4);
15398 // Extend everything to 80 bits to force it to be done on x87.
15399 // TODO: Are there any fast-math-flags to propagate here?
15400 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15401 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15402 DAG.getIntPtrConstant(0, dl));
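// Worked example for the fudge factor above (illustrative, not part of the
// original source): 0x5F800000 is the IEEE-754 single-precision encoding of
// 2^64. For the input bits 0xFFFFFFFFFFFFFFFF, FILD interprets them as the
// signed value -1 and produces -1.0; the sign bit is set, so the +2^64 fudge
// is selected and the x87 f80 addition yields 18446744073709551615.0, the
// correct unsigned interpretation of the input.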
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  EVT TheVT = Op.getOperand(0).getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return std::make_pair(SDValue(), SDValue());
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, and also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned &&
                       DstTy == MVT::i64 &&
                       (!Subtarget.is64Bit() ||
                        !isScalarFPTypeInSSEReg(TheVT));

  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the FIST result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget.is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
    // FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    SDValue Cmp = DAG.getSetCC(DL,
                               getSetCCResultType(DAG.getDataLayout(),
                                                  *DAG.getContext(), TheVT),
                               Value, ThreshVal, ISD::SETLT);
    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
                           DAG.getConstant(0, DL, MVT::i32),
                           DAG.getConstant(0x80000000, DL, MVT::i32));
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
                                              *DAG.getContext(), TheVT),
                       Value, ThreshVal, ISD::SETLT);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }
  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(MF, SSFI));
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  }

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, MemSize, MemSize);

  if (UnsignedFixup) {

    // Insert the FIST, load its result as two i32's,
    // and XOR the high i32 with Adjust.

    SDValue FistOps[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           FistOps, DstTy, MMO);

    SDValue Low32 =
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

    SDValue High32 =
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

    if (Subtarget.is64Bit()) {
      // Join High32 and Low32 into a 64-bit result.
      // (High32 << 32) | Low32
      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
                           DAG.getConstant(32, DL, MVT::i8));
      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
      return std::make_pair(Result, SDValue());
    }

    SDValue ResultOps[] = { Low32, High32 };

    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
      : DAG.getMergeValues(ResultOps, DL);
    return std::make_pair(pair, SDValue());
  } else {
    // Build the FP_TO_INT*_IN_MEM.
    SDValue Ops[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           Ops, DstTy, MMO);
    return std::make_pair(FIST, StackSlot);
  }
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getBitcast(HVT, OpLo);
  OpHi = DAG.getBitcast(HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
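// Illustrative sketch of the pre-AVX2 path above, for a v8i16 -> v8i32
// zero_extend: with In = <a,b,c,d,e,f,g,h> and a zero vector Z,
//   OpLo = vpunpcklwd(In, Z) = <a,0,b,0,c,0,d,0>   (low  four i32 lanes)
//   OpHi = vpunpckhwd(In, Z) = <e,0,f,0,g,0,h,0>   (high four i32 lanes)
// and the concat of the two bitcast halves is the zero-extended v8i32 value;
// any_extend uses undef instead of Z since the high bits don't matter.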
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

  if (InVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Extend VT if the target is a 256- or 128-bit vector and VLX is not
  // supported.
  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue One =
      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
  SDValue Zero =
      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);

  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
  if (VT == ExtVT)
    return SelectedVal;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  return SDValue();
}

static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}
/// Helper to recursively truncate vector elements in half with PACKSS.
/// It makes use of the fact that vector comparison results will be all-zeros
/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
/// within each 128-bit lane.
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 128 bits or greater from a
  // 256-bit or greater source.
  if ((DstVT.getSizeInBits() % 128) != 0)
    return SDValue();
  if ((SrcVT.getSizeInBits() % 256) != 0)
    return SDValue();

  unsigned NumElems = SrcVT.getVectorNumElements();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");

  EVT PackedSVT =
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);

  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

  // 256-bit -> 128-bit truncate - PACKSS lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector()) {
    Lo = DAG.getBitcast(MVT::v8i16, Lo);
    Hi = DAG.getBitcast(MVT::v8i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512-bit -> 256-bit truncate - PACKSS lower/upper 256-bit subvectors.
  // AVX2: 512-bit -> 128-bit truncate - PACKSS(PACKSS, PACKSS).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(MVT::v16i16, Lo);
    Hi = DAG.getBitcast(MVT::v16i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);

    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512-bit -> 128-bit, truncate another stage.
    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
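// Why PACKSS is safe here (illustrative): the inputs are vector compare
// results, so every lane is all-zeros (0) or all-ones (-1). Both values fall
// within the signed range of the narrower element type, so the signed
// saturation performed by PACKSS reproduces them exactly in each half-width
// lane instead of clamping them.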
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // Legal; this will go to VPMOVB2M or VPMOVW2M.
      // Shifting packed bytes is not supported natively, so bitcast to words.
      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
                                      DAG.getBitcast(ExtVT, In),
                                      DAG.getConstant(ShiftInx, DL, ExtVT));
      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
    }
    // Use TESTD/Q; extend the vector to packed dwords/qwords.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                  DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
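// Example of the BWI path above (illustrative): truncating v16i8 to v16i1
// only needs bit 0 of each byte. Shifting each lane left by 7 moves that bit
// into the byte's sign bit; the shift is done on words because there is no
// packed byte shift, but the bits that spill from a low byte land in the high
// byte's bits 0-6, never in its sign bit, so each byte's sign bit is still
// its own original bit 0. VPMOVB2M then collects the sign bits into a k-mask.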
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (VT == MVT::i1) {
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
           "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // Word to byte is only available under BWI.
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }

  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
      return V;
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }
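  // Illustrative trace of the SSE fallback above: for In = <a,b,c,d> : v4i64,
  // OpLo/OpHi are the two v2i64 halves, bitcast to v4i32 as <a0,a1,b0,b1> and
  // <c0,c1,d0,d1> (low dword first on little-endian x86). The {0,2,4,6}
  // shuffle over the pair keeps the low dword of each quadword, giving
  // <a0,b0,c0,d0>, which is exactly the element-wise truncation.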
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));

    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }
  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare the truncation shuffle mask.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }
    return SDValue();
  }

  assert(!VT.isVector());

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    IsSigned, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = VT;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }

  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
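// Mask values used above (illustrative): for f32, FABS ANDs with 0x7FFFFFFF
// (clearing the sign bit) and FNEG XORs with 0x80000000 (flipping the sign
// bit); FNEG(FABS(x)) folds to FOR with 0x80000000, which forces the sign
// bit on (FNABS).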
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
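// Worked example for the logic above (illustrative): copysign(1.0f, -2.0f).
// SignBit = 0xC0000000 & 0x80000000 = 0x80000000, MagBits = 0x3F800000 &
// 0x7FFFFFFF = 0x3F800000, and the FOR produces 0xBF800000, i.e. -1.0f.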
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if this is not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if the index is not a constant.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not a 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
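// Shape of the pattern recognized above (illustrative): an all-zeros test of
// a vector that was scalarized into an OR tree, e.g.
//   (or (or (extractelt v, 0), (extractelt v, 1)),
//       (or (extractelt v, 2), (extractelt v, 3))) == 0
// with every lane of v extracted exactly once, collapses into a single
// PTEST of v against itself, whose ZF result feeds the comparison.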
/// \brief return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }

    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
// Emit a KTEST instruction for bit vectors on AVX-512.
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  if (Op.getOpcode() == ISD::BITCAST) {
    auto hasKTEST = [&](MVT VT) {
      unsigned SizeInBits = VT.getSizeInBits();
      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR: {
          NeedTruncation = true;
          ArithOp = Arith;
        }
      }
  }

  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. The SRL/SHL variant should be preferred for masks longer
  // than this number of bits.
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable, when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero, turn it into an equivalent AND. This allows turning it
    // into a TEST instruction later.
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    break;
  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;

        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;

        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer a TEST instruction.

        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();

        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }

        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();

        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }

        break;
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;

  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);

  default:
  default_case:
    break;
  }
  // If we found that truncation is beneficial, perform the truncation and
  // update the expression.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
      default: break;
      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
      case ISD::AND: ConvertedOp = X86ISD::AND; break;
      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
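// Example of what EmitTest achieves (illustrative): for (seteq (add x, 1), 0)
// where the add's value is only consumed by the comparison, the add is
// re-emitted as the EFLAGS-producing X86ISD::INC and the SETCC reads flag
// result 1 of that node, so no separate TEST/CMP instruction is needed.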
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG);

  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
         "Unexpected comparison operation for MVT::i1 operands");

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Only promote the compare up to i32 if it is a 16-bit operation
    // with an immediate. 16-bit immediates are to be avoided.
    if ((Op0.getValueType() == MVT::i16 &&
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
        !Subtarget.isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, dl, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
    return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
                                           SelectionDAG &DAG, int Enabled,
                                           int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    UseOneConstNR = false;
    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.

  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    // Enable estimate codegen with 1 refinement step for vector division.
    // Scalar division estimates are disabled because they break too much
    // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();

    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}

/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
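// Worked example for the constant-AND case above (illustrative):
// (seteq (and x, 0x10000000000), 0). The mask does not fit in a TEST imm32
// but is a power of two, so LHS = x and RHS = Log2_64_Ceil(2^40) = 40,
// producing "bt x, 40" with the COND_AE sense for equal-to-zero.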
// Convert (truncate (srl X, N) to i1) to (bt X, N).
static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
                                 const SDLoc &dl, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
         "Expected TRUNCATE to i1 node");

  if (Op.getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();

  SDValue ShiftRight = Op.getOperand(0);
  return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
                             CC, dl, DAG);
}

/// Result of 'and' or 'trunc to i1' is compared against zero.
/// Change to a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
                                     const SDLoc &dl, SelectionDAG &DAG) const {
  if (Op.getOpcode() == ISD::AND)
    return LowerAndToBT(Op, CC, dl, DAG);
  if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
    return LowerTruncateToBT(Op, CC, dl, DAG);
  return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
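// Example mapping (illustrative): SETOGT sets Swap and falls through to
// SSECC = 1 (LT), i.e. "a > b" is emitted as "b < a" with the operands
// swapped, matching the ordered-compare immediates of CMPPS/CMPPD. The
// out-of-range value 8 returned for SETUEQ/SETONE signals the caller that
// the condition needs a multi-instruction expansion.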
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

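// For example, on AVX1 (which lacks 256-bit integer compares) a
//   setcc v8i32 %a, %b, setgt
// becomes two v4i32 PCMPGTDs on the extracted 128-bit halves whose results
// are glued back together with CONCAT_VECTORS.
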
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}

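// Sanity check for the i1 identities above, taking (x < y) -> (~x & y) as an
// example: over i1 the only true case of x < y is x == 0, y == 1, and indeed
// ~0 & 1 == 1 while the remaining three input combinations yield 0.
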
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}

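// For example, (x u< <8, 8, 8, 8>) becomes (x u<= <7, 7, 7, 7>). The
// all-zero-element case is rejected above because 0 - 1 would wrap around
// and change the comparison's meaning.
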
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This allows
      // them to work with an SSE1 target (integer vector types are not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
    // available.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
    if (SSECC == 8) {
      // LLVM predicate is SETUEQ or SETONE.
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (SetCCOpcode == ISD::SETUEQ) {
        CC0 = 3; // UNORD
        CC1 = 0; // EQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
                                           static_cast<unsigned>(ISD::OR);
      } else {
        assert(SetCCOpcode == ISD::SETONE);
        CC0 = 7; // ORD
        CC1 = 4; // NEQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
                                           static_cast<unsigned>(ISD::AND);
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    } else {
      // Handle all other FP comparisons here.
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, dl, MVT::i8));
    }

    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
    // result type of SETCC. The bitcast is expected to be optimized away
    // during combining/isel.
    if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }

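  // Concretely, a v4f32 SETUEQ (unordered-or-equal) is emitted as
  //   cmpunordps + cmpeqps + orps    (CC0 = 3, CC1 = 0, combined with FOR)
  // and SETONE (ordered-and-not-equal) as its dual
  //   cmpordps + cmpneqps + andps    (CC0 = 7, CC1 = 4, combined with FAND).
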
  MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
    // legalizer firstly checks if the first operand in input to the setcc has
    // a legal type. If so, then it promotes the return type to that same type.
    // Otherwise, the return type is promoted to the 'next legal type' which,
    // for a vector of MVT::i1 is always a 128-bit integer vector type.
    //
    // We reach this code only if the following two conditions are met:
    // 1. Both return type and operand type have been promoted to wider types
    //    by the type legalizer.
    // 2. The original operand type has been promoted to a 256-bit vector.
    //
    // Note that condition 2. only applies for AVX targets.
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
  }

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // Operands are boolean (vectors of i1)
  MVT OpVT = Op1.getSimpleValueType();
  if (OpVT.getVectorElementType() == MVT::i1)
    return LowerBoolVSETCC_AVX512(Op, DAG);

  // The result is boolean, but operands are int/float
  if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture setcc returns a mask with i1 elements, but
    // there is no compare instruction for i8 and i16 elements in KNL. In that
    // case use an SSE compare instead.
    bool UseAVX512Inst =
      (OpVT.is512BitVector() ||
       OpVT.getScalarSizeInBits() >= 32 ||
       (Subtarget.hasBWI() && Subtarget.hasVLX()));

    if (UseAVX512Inst)
      return LowerIntVSETCC_AVX512(Op, DAG);

    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // Lower using XOP integer comparisons.
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (SetCCOpcode) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
                       ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc;
  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
  bool Subus = false;

  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
                    Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = X86ISD::PCMPGT;
                    FlipSigns = true; Invert = true; break;
  }

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
      (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
   || (Subtarget.hasSSE2() && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }

    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }

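  // The min/max trick above relies on the identity x u<= y <=> umin(x, y) == x:
  // SETULE becomes a UMIN here, and the matching PCMPEQ against Op0 is emitted
  // by the 'if (MinMax)' clause at the bottom of this function.
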
  bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  if (!MinMax && hasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget.hasAVX())
        break;
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
      Invert = false;
    }
  }

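  // PSUBUS is an unsigned saturating subtract, i.e. it computes max(a - b, 0)
  // per lane, so its result is all-zero exactly when a u<= b. The PCMPEQ
  // against the zero vector at the bottom of this function turns that into
  // the desired lane mask; e.g. x u>= y is handled as y u<= x via the Swap
  // set above.
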
  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

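    // Recap of the emulation above: the XOR with SB makes the low 32-bit
    // halves comparable with the signed PCMPGTD (the low-half compare must
    // always be unsigned), while the high halves keep the requested
    // signedness. The MaskHi/MaskLo shuffles then broadcast each 32-bit
    // verdict across its 64-bit element so that (GTHi | (EQHi & GTLo))
    // forms a proper v2i64 lane mask.
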
    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  // Lower (trunc (X >> N) to i1) to BT(X, N).
  if (Op0.hasOneUse() && isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (isOneConstant(Op1)) {
      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
    }
    if (!isNullConstant(Op1)) {
      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
    }
  }

  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}

SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
  SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
  if (Op.getSimpleValueType() == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  return SetCC;
}

/// Return true if opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool AddTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (SSECC != 8) {
      if (Subtarget.hasAVX512()) {
        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                  CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
        return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
                           DL, VT, Cmp, Op1, Op2);
      }

      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.

      if (Subtarget.hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.

        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

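  // Schematically, the CMP/AND/ANDN/OR sequence built above computes
  //   mask = FSETCC cond_op0, cond_op1, cc   ; all-ones or all-zeros
  //   res  = (mask & op1) | (~mask & op2)
  // i.e. a branchless bitwise select keyed on the full-width compare mask.
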
  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;

      SDValue CmpOp0 = Cmp.getOperand(0);
      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
                                  DAG.getConstant(0, DL,
                                                  CmpOp0.getValueType()),
                                  CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

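      // The neg/sbb idiom above: SUB 0, x sets the carry flag exactly when
      // x != 0, and SETCC_CARRY materializes CF as 0 or all-ones, so
      //   (select (x != 0), -1, 0)  ->  neg x; sbb r, r   ; r = x ? -1 : 0
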
      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);
      SDValue Src1, Src2;
      // true if Op2 is XOR or OR operator and one of its operands
      // is equal to Op1
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }

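  // Quick check of the mask algebra above for
  // (select ((x & 1) == 0), y, (z ^ y)): if x & 1 == 0 then Mask == 0 and
  // (Mask & z) ^ y == y; if x & 1 == 1 then Mask is all-ones and the result
  // is z ^ y, matching the original select in both cases.
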
  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT)) // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a < b ? -1 : 0 -> RES = ~setcc_carry
  // a < b ? 0 : -1 -> RES = setcc_carry
  // a >= b ? -1 : 0 -> RES = setcc_carry
  // a >= b ? 0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}

static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);

  // SKX processor
  if ((InVTElt == MVT::i1) &&
      (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
       ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVTElt != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
      return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
  }

  if (InVTElt != MVT::i1)
    return SDValue();

  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512 / NumElts), NumElts);

  SDValue V;
  if (Subtarget.hasDQI()) {
    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
    assert(!VT.is512BitVector() && "Unexpected vector type");
  } else {
    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
    if (ExtVT == VT)
      return V;
  }

  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}

// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getSizeInBits() == InVT.getSizeInBits());

  MVT SVT = VT.getVectorElementType();
  MVT InSVT = InVT.getVectorElementType();
  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);

  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
  // For 512-bit vectors, we need 128-bits or 256-bits.
  if (VT.getSizeInBits() > 128) {
    // Input needs to be at least the same number of elements as output, and
    // at least 128-bits.
    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
  }

  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

  // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
  // need to be handled here for 256/512-bit results.
  if (Subtarget.hasInt256()) {
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
                      X86ISD::VSEXT : X86ISD::VZEXT;
    return DAG.getNode(ExtOpc, dl, VT, In);
  }

  // We should only get here for sign extend.
  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
         "Unexpected opcode!");

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }

  SDValue SignExt = Curr;
  if (CurrVT != InVT) {
    unsigned SignExtShift =
        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (CurrVT == VT)
    return SignExt;

  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                               DAG.getConstant(31, dl, MVT::i8));
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SDValue();
}

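// Example of the pre-SSE4.1 path above for the low four lanes of a
// v16i8 -> v4i32 sign extension: each UNPCKL against undef places the source
// byte in the high byte of a progressively wider lane, so after
//   punpcklbw (bytes -> words) and punpcklwd (words -> dwords)
// the original byte sits in bits 24..31 of each dword, and psrad $24 shifts
// its sign bits back down across the whole lane.
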
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and v4i32 to v4i64:
  //
  // Divide input vector into two parts
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT

  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int, 8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems / 2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);

  SmallVector<int, 8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems / 2; ++i)
    ShufMask2[i] = i + NumElems / 2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

// Lower truncating store. We need a special lowering to vXi1 vectors.
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
  SDLoc dl(St);
  EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
         "Expected truncstore of i1 vector");

  SDValue Op = St->getValue();
  MVT OpVT = Op.getValueType().getSimpleVT();
  unsigned NumElts = OpVT.getVectorNumElements();
  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
      NumElts == 16) {
    // Truncate and store - everything is legal.
    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
    if (MemVT.getSizeInBits() < 8)
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
                       DAG.getUNDEF(MVT::v8i1), Op,
                       DAG.getIntPtrConstant(0, dl));
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  // A subset, assume that we have only AVX-512F.
  if (NumElts <= 8) {
    // Extend to an 8-element vector.
    MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
                     DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
  // Divide the vector into two parts and store each part separately.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(0, dl));
  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
  SDValue BasePtr = St->getBasePtr();
  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
                              St->getMemOperand());
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(16, dl));
  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));

  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
                              BasePtrHi, St->getMemOperand());
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
}

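// Note on the layout above: each v16i1 half occupies 16 bits (2 bytes) in
// memory, which is why the high half of the v32i1 truncstore is written at
// BasePtr + 2.
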
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
         "Expected i1 vector load");
  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
                       ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
  MVT VT = Op.getValueType().getSimpleVT();
  unsigned NumElts = VT.getVectorNumElements();

  if ((Subtarget.hasBWI() && NumElts >= 32) ||
      (Subtarget.hasDQI() && NumElts < 16) ||
      NumElts == 16) {
    // Load and extend - everything is legal.
    if (NumElts < 8) {
      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
                                 Ld->getBasePtr(),
                                 Ld->getMemOperand());
      // Replace chain users with the new chain.
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
    }
    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
  }

  if (NumElts <= 8) {
    // A subset, assume that we have only AVX-512F.
    unsigned NumBitsToLoad = 8;
    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
    SDValue BitVec = DAG.getBitcast(MaskVT, Load);

    if (NumElts == 8)
      return DAG.getNode(ExtOpcode, dl, VT, BitVec);

    // Take care of v4i1 and v2i1: extend via an 8-element vector and
    // extract the original subvector.
    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(VT == MVT::v32i8 && "Unexpected extload type");

  SmallVector<SDValue, 2> Chains;

  SDValue BasePtr = Ld->getBasePtr();
  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
  Chains.push_back(LoadLo.getValue(1));

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));

  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               BasePtrHi,
                               Ld->getMemOperand());
  Chains.push_back(LoadHi.getValue(1));
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
}

// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  MVT RegVT = Op.getSimpleValueType();
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
  assert(RegVT.isInteger() &&
         "We only custom lower integer vector sext loads.");

  // Nothing useful we can do without SSE2 shuffles.
  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  if (MemVT.getScalarType() == MVT::i1)
    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
         && "Only anyext and sext are currently implemented.");
  assert(MemVT != RegVT && "Cannot extend to the same type");
  assert(MemVT.isVector() && "Must load a vector from memory");

  unsigned NumElems = RegVT.getVectorNumElements();
  unsigned MemSz = MemVT.getSizeInBits();
  assert(RegSz > MemSz && "Register size must be greater than the mem size");

  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
    // The only way in which we have a legal 256-bit vector result but not the
    // integer 256-bit operations needed to directly lower a sextload is if we
    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
    // a 128-bit vector and a normal sign_extend to 256-bits that should get
    // correctly legalized. We do this late to allow the canonical form of
    // sextload to persist throughout the rest of the DAG combiner -- it wants
    // to fold together any extensions it can, and so will fuse a sign_extend
    // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
      // Just switch this to a normal load.
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
                                       "it must be a legal 128-bit vector "
                                       "type!");
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    } else {
      assert(MemSz < 128 &&
             "Can't extend a type wider than 128 bits to a 256 bit vector!");
      // Do an sext load to a 128-bit vector type. We want to use the same
      // number of elements, but elements half as wide. This will end up being
      // recursively lowered by this routine, but will succeed as we definitely
      // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    }

    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }

  // All sizes must be a power of two.
  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
         "Non-power-of-two elements are not custom lowered!");

  // Attempt to load the original value using scalar loads.
  // Find the largest scalar type that divides the total loaded size.
  MVT SclrLoadTy = MVT::i8;
  for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }

  // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
    SclrLoadTy = MVT::f64;

  // Calculate the number of scalar loads that we need to perform
  // in order to load our vector from memory.
  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
         "Can only lower sext loads with a single scalar load!");

  unsigned loadRegZize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    loadRegZize = 128;

  // Represent our vector as a sequence of elements which are the
  // largest scalar that we can load.
  EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());

  // Represent the data using the same element type that is stored in
  // memory. In practice, we ''widen'' MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegZize / MemVT.getScalarSizeInBits());

  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
         "Invalid vector type");

  // We can't shuffle using an illegal type.
  assert(TLI.isTypeLegal(WideVecVT) &&
         "We only lower types that form legal widened vector types");

  SmallVector<SDValue, 8> Chains;
  SDValue Ptr = Ld->getBasePtr();
  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
                                      TLI.getPointerTy(DAG.getDataLayout()));
  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

  for (unsigned i = 0; i < NumLoads; ++i) {
    // Perform a single load.
    SDValue ScalarLoad =
        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());
    Chains.push_back(ScalarLoad.getValue(1));
    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
    // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
  }

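  // For example, a sext-load of v4i8 into v4i32 has MemSz == 32, so a single
  // i32 scalar load suffices (NumLoads == 1); the value is placed in a vector
  // with SCALAR_TO_VECTOR and then widened/sign-extended below.
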
  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

  // Bitcast the loaded value to a vector of the original element type, in
  // the size of the target vector type.
  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
  unsigned SizeRatio = RegSz / MemSz;

  if (Ext == ISD::SEXTLOAD) {
    // If we have SSE4.1, we can directly emit a VSEXT node.
    if (Subtarget.hasSSE41()) {
      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }

    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
    // lanes.
    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }

  // Redistribute the loaded elements into the different locations.
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i * SizeRatio] = i;

  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
                                       DAG.getUNDEF(WideVecVT), ShuffleVec);

  // Bitcast to the requested type.
  Shuff = DAG.getBitcast(RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}

/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
/// each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

/// Return true if node is an ISD::XOR of an X86ISD::SETCC and 1, where the
/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  if (isOneConstant(Op.getOperand(1)))
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isNullConstant(Cond.getOperand(1)) &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
18436 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User); (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
      // These should have been transformed by the DAG combiner, except when
      // the condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
18497 } else if (Cond.getOpcode() == ISD::SETCC &&
18498 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18499 // For FCMP_OEQ, we can emit
18500 // two branches instead of an explicit AND instruction with a
18501 // separate test. However, we only do this if this block doesn't
18502 // have a fall-through edge, because this requires an explicit
18503 // jmp when the condition is false.
18504 if (Op.getNode()->hasOneUse()) {
18505 SDNode *User = *Op.getNode()->use_begin();
18506 // Look for an unconditional branch following this conditional branch.
18507 // We need this because we need to reverse the successors in order
18508 // to implement FCMP_OEQ.
18509 if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User); (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
18528 } else if (Cond.getOpcode() == ISD::SETCC &&
18529 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18530 // For FCMP_UNE, we can emit
18531 // two branches instead of an explicit AND instruction with a
18532 // separate test. However, we only do this if this block doesn't
18533 // have a fall-through edge, because this requires an explicit
18534 // jmp when the condition is false.
18535 if (Op.getNode()->hasOneUse()) {
18536 SDNode *User = *Op.getNode()->use_begin();
18537 // Look for an unconditional branch following this conditional branch.
18538 // We need this because we need to reverse the successors in order
18539 // to implement FCMP_UNE.
18540 if (User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
          DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User); (void)NewBR;
        Dest = FalseBB;

        SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                  Cond.getOperand(0), Cond.getOperand(1));
        Cmp = ConvertCmpIfNecessary(Cmp, DAG);
        CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
        Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                            Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);
    // We know the result is compared against zero. Try to match it to BT.
    if (Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/MinGW targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4K
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
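// For example (illustrative only): a single 16 KB dynamic allocation must
// probe the stack once per 4 KB page, in address order, before the final SP
// adjustment, so that each OS guard page is committed before the next page
// is touched.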
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18594 SelectionDAG &DAG) const {
18595 MachineFunction &MF = DAG.getMachineFunction();
18596 bool SplitStack = MF.shouldSplitStack();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack;
  SDLoc dl(Op);
18602 SDNode *Node = Op.getNode();
18603 SDValue Chain = Op.getOperand(0);
18604 SDValue Size = Op.getOperand(1);
18605 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18606 EVT VT = Node->getValueType(0);
18608 // Chain the dynamic stack allocation so that it doesn't modify the stack
18609 // pointer when other instructions are using the stack.
18610 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18612 bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18618 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18619 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18620 " not tell us which reg is the stack pointer!");
18622 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18623 Chain = SP.getValue(1);
18624 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18625 unsigned StackAlign = TFI.getStackAlignment();
18626 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18627 if (Align > StackAlign)
18628 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18629 DAG.getConstant(-(uint64_t)Align, dl, VT));
18630 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18631 } else if (SplitStack) {
18632 MachineRegisterInfo &MRI = MF.getRegInfo();
    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
18637 const Function *F = MF.getFunction();
18638 for (const auto &A : F->args()) {
18639 if (A.hasNestAttr())
18640 report_fatal_error("Cannot use segmented stacks with functions that "
18641 "have nested arguments.");
18645 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18646 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18647 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18648 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18652 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18653 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18655 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18656 unsigned SPReg = RegInfo->getStackRegister();
18657 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18662 DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18670 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18672 SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
18676 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18677 MachineFunction &MF = DAG.getMachineFunction();
18678 auto PtrVT = getPointerTy(MF.getDataLayout());
18679 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);
18684 if (!Subtarget.is64Bit() ||
18685 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18686 // vastart just stores the address of the VarArgsFrameIndex slot into the
18687 // memory location argument.
18688 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18689 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters coming in memory).
  //   reg_save_area
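  // A sketch of the corresponding C struct (per the SysV AMD64 ABI):
  //   struct __va_list_tag {
  //     unsigned int gp_offset;
  //     unsigned int fp_offset;
  //     void *overflow_arg_area;
  //     void *reg_save_area;
  //   };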
18698 SmallVector<SDValue, 8> MemOps;
18699 SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
18702 Op.getOperand(0), DL,
18703 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18704 MachinePointerInfo(SV));
18705 MemOps.push_back(Store);
  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18709 Store = DAG.getStore(
18710 Op.getOperand(0), DL,
18711 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18712 MachinePointerInfo(SV, 4));
18713 MemOps.push_back(Store);
18715 // Store ptr to overflow_arg_area
18716 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18720 MemOps.push_back(Store);
18722 // Store ptr to reg_save_area.
18723 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18724 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18725 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18726 Store = DAG.getStore(
18727 Op.getOperand(0), DL, RSFIN, FIN,
18728 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18729 MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
18733 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18734 assert(Subtarget.is64Bit() &&
18735 "LowerVAARG only handles 64-bit va_arg!");
18736 assert(Op.getNumOperands() == 4);
18738 MachineFunction &MF = DAG.getMachineFunction();
18739 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18740 // The Win64 ABI uses char* instead of a structure.
18741 return DAG.expandVAArg(Op.getNode());
18743 SDValue Chain = Op.getOperand(0);
18744 SDValue SrcPtr = Op.getOperand(1);
18745 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);
18749 EVT ArgVT = Op.getNode()->getValueType(0);
18750 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;
18754 // Decide which area this value should be read from.
18755 // TODO: Implement the AMD64 ABI in its entirety. This simple
18756 // selection mechanism works only for the basic types.
18757 if (ArgVT == MVT::f80) {
18758 llvm_unreachable("va_arg for f80 not yet implemented");
18759 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18760 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18761 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }
18767 if (ArgMode == 2) {
18768 // Sanity Check: Make sure using fp_offset makes sense.
18769 assert(!Subtarget.useSoftFloat() &&
18770 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }
18774 // Insert VAARG_64 node into the DAG
18775 // VAARG_64 returns two values: Variable Argument Address, Chain
18776 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18777 DAG.getConstant(ArgMode, dl, MVT::i8),
18778 DAG.getConstant(Align, dl, MVT::i32)};
18779 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18780 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18781 VTs, InstOps, MVT::i64,
18782 MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
18787 Chain = VAARG.getValue(1);
18789 // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
18793 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18794 SelectionDAG &DAG) {
18795 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18796 // where a va_list is still an i8*.
18797 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
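  // e.g. sizeof(__va_list_tag) on LP64 is 4 + 4 + 8 + 8 == 24 bytes, which is
  // where the 24-byte memcpy size below comes from.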
18798 if (Subtarget.isCallingConvWin64(
18799 DAG.getMachineFunction().getFunction()->getCallingConv()))
18800 // Probably a Win64 va_copy.
18801 return DAG.expandVACopy(Op.getNode());
18803 SDValue Chain = Op.getOperand(0);
18804 SDValue DstPtr = Op.getOperand(1);
18805 SDValue SrcPtr = Op.getOperand(2);
18806 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);
18810 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18811 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       /*AlwaysInline=*/false, /*isTailCall=*/false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
18816 /// Handle vector element shifts where the shift amount is a constant.
18817 /// Takes immediate version of shift as input.
18818 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18819 SDValue SrcOp, uint64_t ShiftAmt,
18820 SelectionDAG &DAG) {
18821 MVT ElementType = VT.getVectorElementType();
  // Bitcast the source vector to the output type; this is mainly necessary
  // for vXi8/vXi64 shifts.
18825 if (VT != SrcOp.getSimpleValueType())
18826 SrcOp = DAG.getBitcast(VT, SrcOp);
  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;
18832 // Check for ShiftAmt >= element width
18833 if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }
18840 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18841 && "Unknown target vector shift-by-constant node");
18843 // Fold this packed vector shift into a build vector if SrcOp is a
18844 // vector of Constants or UNDEFs.
18845 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18846 SmallVector<SDValue, 8> Elts;
18847 unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }
18893 return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
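// For example, getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v4i32, X,
// 5, DAG) produces (X86ISD::VSHLI X, (i8 5)), and constant-folds to a
// BUILD_VECTOR of shifted constants when X is a constant vector.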
18897 /// Handle vector element shifts where the shift amount may or may not be a
18898 /// constant. Takes immediate version of shift as input.
18899 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18900 SDValue SrcOp, SDValue ShAmt,
18901 const X86Subtarget &Subtarget,
18902 SelectionDAG &DAG) {
18903 MVT SVT = ShAmt.getSimpleValueType();
18904 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18906 // Catch shift-by-constant.
18907 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18908 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18909 CShAmt->getZExtValue(), DAG);
  // Change opcode to non-immediate version.
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }
  // Need to build a vector containing the shift amount.
  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
  // +=================+============+=======================================+
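  // For example, with only SSE2 available an i32 shift amount S is lowered to
  // (v4i32 build_vector S, 0, undef, undef); the packed shift then reads only
  // the low 64 bits, i.e. the (S, 0) pair.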
18930 if (SVT == MVT::i64)
18931 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18932 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18933 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18934 ShAmt = ShAmt.getOperand(0);
18935 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
18936 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18937 } else if (Subtarget.hasSSE41() &&
18938 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18939 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18943 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }
18947 // The return type has to be a 128-bit type with the same element
18948 // type as the input type.
18949 MVT EltVT = VT.getVectorElementType();
18950 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18952 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18953 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
/// \brief Return \p Mask with the necessary casting or extending applied,
/// according to \p MaskVT, for use when lowering masking intrinsics.
18958 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
18962 if (isAllOnesConstant(Mask))
18963 return DAG.getTargetConstant(1, dl, MaskVT);
18964 if (X86::isZeroNode(Mask))
18965 return DAG.getTargetConstant(0, dl, MaskVT);
18967 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18968 // Mask should be extended
18969 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }
18973 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18974 if (MaskVT == MVT::v64i1) {
18975 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18979 DAG.getConstant(0, dl, MVT::i32));
18980 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18981 DAG.getConstant(1, dl, MVT::i32));
18983 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18984 Hi = DAG.getBitcast(MVT::v32i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
    } else {
      // MaskVT requires < 64 bits. Truncate the mask (this should always
      // succeed) and bitcast to the required subtype (64 bits or less).
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
      return DAG.getBitcast(MaskVT,
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
    }
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18997 Mask.getSimpleValueType().getSizeInBits());
18998 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
18999 // are extracted by EXTRACT_SUBVECTOR.
19000 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19001 DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
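// For example, an i8 mask with MaskVT == v2i1 is bitcast to v8i1 and the low
// two elements are extracted, while an i64 mask with MaskVT == v64i1 on a
// 32-bit target is split into two i32 halves, bitcast to v32i1, and
// concatenated.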
19006 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19007 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19008 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19009 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19010 SDValue PreservedSrc,
19011 const X86Subtarget &Subtarget,
19012 SelectionDAG &DAG) {
19013 MVT VT = Op.getSimpleValueType();
19014 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  switch (Op.getOpcode()) {
  default: break;
19025 case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
19029 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19030 case X86ISD::VFPCLASS:
19031 case X86ISD::VFPCLASSS:
19032 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19033 case X86ISD::VTRUNC:
19034 case X86ISD::VTRUNCS:
19035 case X86ISD::VTRUNCUS:
19036 case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example vpmovqb requires only AVX512,
    // whereas a vselect that operates on byte elements requires AVX512BW.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }
19043 if (PreservedSrc.isUndef())
19044 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
19048 /// \brief Creates an SDNode for a predicated scalar operation.
19049 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
19052 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19053 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19054 /// for a scalar instruction.
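/// For example, a masked scalar operation is lowered roughly as
///   (X86ISD::SELECTS (v1i1 Mask), Op, PreservedSrc),
/// with the i8 mask first wrapped into a v1i1 via SCALAR_TO_VECTOR.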
19055 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19056 SDValue PreservedSrc,
19057 const X86Subtarget &Subtarget,
19058 SelectionDAG &DAG) {
  SDLoc dl(Op);
  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();

  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19068 if (Op.getOpcode() == X86ISD::FSETCCM ||
19069 Op.getOpcode() == X86ISD::FSETCCM_RND)
19070 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19071 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19072 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19074 if (PreservedSrc.isUndef())
19075 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
19079 static int getSEHRegistrationNodeSize(const Function *Fn) {
19080 if (!Fn->hasPersonalityFn())
19081 report_fatal_error(
19082 "querying registration node size for function without personality");
19083 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19084 // WinEHStatePass for the full struct definition.
19085 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19086 case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
19094 /// When the MSVC runtime transfers control to us, either to an outlined
19095 /// function or when returning to a parent frame after catching an exception, we
19096 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19097 /// Here's the math:
19098 /// RegNodeBase = EntryEBP - RegNodeSize
19099 /// ParentFP = RegNodeBase - ParentFrameOffset
19100 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19101 /// subtracting the offset (negative on x86) takes us back to the parent FP.
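/// For example (illustrative numbers only): with EntryEBP = 0x1000 and a
/// 32-bit SEH personality (RegNodeSize = 24), RegNodeBase = 0x1000 - 24 =
/// 0xFE8; if ParentFrameOffset = -16, then ParentFP = 0xFE8 - (-16) = 0xFF8.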
19102 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19103 SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;
19107 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19108 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19110 // It's possible that the parent function no longer has a personality function
19111 // if the exceptional code was optimized away, in which case we just return
19112 // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;
19116 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19117 // registration, or the .set_setframe offset.
19118 MCSymbol *OffsetSym =
19119 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19120 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19121 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19122 SDValue ParentFrameOffset =
19123 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19125 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19126 // prologue to RBP in the parent function.
19127 const X86Subtarget &Subtarget =
19128 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19129 if (Subtarget.is64Bit())
19130 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19132 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19133 // RegNodeBase = EntryEBP - RegNodeSize
19134 // ParentFP = RegNodeBase - ParentFrameOffset
19135 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19136 DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
19140 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19141 SelectionDAG &DAG) {
19142 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19143 auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19153 MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
19157 case INTR_TYPE_1OP:
19158 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19159 case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
19162 case INTR_TYPE_3OP:
19163 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19164 Op.getOperand(2), Op.getOperand(3));
19165 case INTR_TYPE_4OP:
19166 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19167 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19168 case INTR_TYPE_1OP_MASK_RM: {
19169 SDValue Src = Op.getOperand(1);
19170 SDValue PassThru = Op.getOperand(2);
19171 SDValue Mask = Op.getOperand(3);
19172 SDValue RoundingMode;
19173 // We always add rounding mode to the Node.
19174 // If the rounding mode is not specified, we add the
19175 // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
            DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
19186 case INTR_TYPE_1OP_MASK: {
19187 SDValue Src = Op.getOperand(1);
19188 SDValue PassThru = Op.getOperand(2);
19189 SDValue Mask = Op.getOperand(3);
19190 // We add rounding mode to the Node when
19191 // - RM Opcode is specified and
19192 // - RM is not "current direction".
19193 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19194 if (IntrWithRoundingModeOpcode != 0) {
19195 SDValue Rnd = Op.getOperand(4);
19196 if (!isRoundModeCurDirection(Rnd)) {
19197 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                dl, Op.getValueType(),
                                                Src, Rnd),
                                    Mask, PassThru, Subtarget, DAG);
        }
      }
19203 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
19206 case INTR_TYPE_SCALAR_MASK: {
19207 SDValue Src1 = Op.getOperand(1);
19208 SDValue Src2 = Op.getOperand(2);
19209 SDValue passThru = Op.getOperand(3);
19210 SDValue Mask = Op.getOperand(4);
19211 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19212 if (IntrWithRoundingModeOpcode != 0) {
19213 SDValue Rnd = Op.getOperand(5);
19214 if (!isRoundModeCurDirection(Rnd))
19215 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19216 dl, VT, Src1, Src2, Rnd),
                                      Mask, passThru, Subtarget, DAG);
      }
19219 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
19222 case INTR_TYPE_SCALAR_MASK_RM: {
19223 SDValue Src1 = Op.getOperand(1);
19224 SDValue Src2 = Op.getOperand(2);
19225 SDValue Src0 = Op.getOperand(3);
19226 SDValue Mask = Op.getOperand(4);
19227 // There are 2 kinds of intrinsics in this group:
19228 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19229 // (2) With rounding mode and sae - 7 operands.
19230 if (Op.getNumOperands() == 6) {
19231 SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                                Src2, Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
19236 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19237 SDValue RoundingMode = Op.getOperand(5);
19238 SDValue Sae = Op.getOperand(6);
19239 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19240 RoundingMode, Sae),
19241 Mask, Src0, Subtarget, DAG);
19243 case INTR_TYPE_2OP_MASK:
19244 case INTR_TYPE_2OP_IMM8_MASK: {
19245 SDValue Src1 = Op.getOperand(1);
19246 SDValue Src2 = Op.getOperand(2);
19247 SDValue PassThru = Op.getOperand(3);
19248 SDValue Mask = Op.getOperand(4);
19250 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19251 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19253 // We specify 2 possible opcodes for intrinsics with rounding modes.
19254 // First, we check if the intrinsic may have non-default rounding mode,
19255 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19256 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19257 if (IntrWithRoundingModeOpcode != 0) {
19258 SDValue Rnd = Op.getOperand(5);
19259 if (!isRoundModeCurDirection(Rnd)) {
19260 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                dl, Op.getValueType(),
                                                Src1, Src2, Rnd),
                                    Mask, PassThru, Subtarget, DAG);
        }
      }
19266 // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
19270 case INTR_TYPE_2OP_MASK_RM: {
19271 SDValue Src1 = Op.getOperand(1);
19272 SDValue Src2 = Op.getOperand(2);
19273 SDValue PassThru = Op.getOperand(3);
19274 SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode. First, we check whether the intrinsic has a rounding
      // mode (6 operands); if not, we set the rounding mode to "current".
      SDValue Rnd;
19280 if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
19288 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19289 SDValue Src1 = Op.getOperand(1);
19290 SDValue Src2 = Op.getOperand(2);
19291 SDValue Src3 = Op.getOperand(3);
19292 SDValue PassThru = Op.getOperand(4);
19293 SDValue Mask = Op.getOperand(5);
19294 SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
19300 case INTR_TYPE_3OP_MASK_RM: {
19301 SDValue Src1 = Op.getOperand(1);
19302 SDValue Src2 = Op.getOperand(2);
19303 SDValue Imm = Op.getOperand(3);
19304 SDValue PassThru = Op.getOperand(4);
19305 SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode. First, we check whether the intrinsic has a rounding
      // mode (7 operands); if not, we set the rounding mode to "current".
      SDValue Rnd;
19311 if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19315 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19316 Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
19319 case INTR_TYPE_3OP_IMM8_MASK:
19320 case INTR_TYPE_3OP_MASK: {
19321 SDValue Src1 = Op.getOperand(1);
19322 SDValue Src2 = Op.getOperand(2);
19323 SDValue Src3 = Op.getOperand(3);
19324 SDValue PassThru = Op.getOperand(4);
19325 SDValue Mask = Op.getOperand(5);
19327 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19328 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19330 // We specify 2 possible opcodes for intrinsics with rounding modes.
19331 // First, we check if the intrinsic may have non-default rounding mode,
19332 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19333 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19334 if (IntrWithRoundingModeOpcode != 0) {
19335 SDValue Rnd = Op.getOperand(6);
19336 if (!isRoundModeCurDirection(Rnd)) {
19337 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19338 dl, Op.getValueType(),
19339 Src1, Src2, Src3, Rnd),
19340 Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
19347 case VPERM_2OP_MASK : {
19348 SDValue Src1 = Op.getOperand(1);
19349 SDValue Src2 = Op.getOperand(2);
19350 SDValue PassThru = Op.getOperand(3);
19351 SDValue Mask = Op.getOperand(4);
19353 // Swap Src1 and Src2 in the node creation
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
19357 case VPERM_3OP_MASKZ:
19358 case VPERM_3OP_MASK:{
19359 MVT VT = Op.getSimpleValueType();
19360 // Src2 is the PassThru
19361 SDValue Src1 = Op.getOperand(1);
19362 // PassThru needs to be the same type as the destination in order
19363 // to pattern match correctly.
19364 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19365 SDValue Src3 = Op.getOperand(3);
19366 SDValue Mask = Op.getOperand(4);
19367 SDValue PassThru = SDValue();
19369 // set PassThru element
19370 if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src1;
19375 // Swap Src1 and Src2 in the node creation
19376 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASKZ:
    case FMA_OP_MASK3:
19383 case FMA_OP_MASK: {
19384 SDValue Src1 = Op.getOperand(1);
19385 SDValue Src2 = Op.getOperand(2);
19386 SDValue Src3 = Op.getOperand(3);
19387 SDValue Mask = Op.getOperand(4);
19388 MVT VT = Op.getSimpleValueType();
19389 SDValue PassThru = SDValue();
19391 // set PassThru element
19392 if (IntrData->Type == FMA_OP_MASKZ)
19393 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;
19399 // We specify 2 possible opcodes for intrinsics with rounding modes.
19400 // First, we check if the intrinsic may have non-default rounding mode,
19401 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19402 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19403 if (IntrWithRoundingModeOpcode != 0) {
19404 SDValue Rnd = Op.getOperand(5);
19405 if (!isRoundModeCurDirection(Rnd))
19406 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19407 dl, Op.getValueType(),
19408 Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
19411 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
19416 case FMA_OP_SCALAR_MASK:
19417 case FMA_OP_SCALAR_MASK3:
19418 case FMA_OP_SCALAR_MASKZ: {
19419 SDValue Src1 = Op.getOperand(1);
19420 SDValue Src2 = Op.getOperand(2);
19421 SDValue Src3 = Op.getOperand(3);
19422 SDValue Mask = Op.getOperand(4);
19423 MVT VT = Op.getSimpleValueType();
19424 SDValue PassThru = SDValue();
19426 // set PassThru element
19427 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19428 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;
19434 SDValue Rnd = Op.getOperand(5);
19435 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                               Op.getValueType(), Src1, Src2,
                                               Src3, Rnd),
                                   Mask, PassThru, Subtarget, DAG);
    }
19440 case TERLOG_OP_MASK:
19441 case TERLOG_OP_MASKZ: {
19442 SDValue Src1 = Op.getOperand(1);
19443 SDValue Src2 = Op.getOperand(2);
19444 SDValue Src3 = Op.getOperand(3);
19445 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19446 SDValue Mask = Op.getOperand(5);
19447 MVT VT = Op.getSimpleValueType();
19448 SDValue PassThru = Src1;
19449 // Set PassThru element.
19450 if (IntrData->Type == TERLOG_OP_MASKZ)
19451 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19453 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19454 Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
19459 // does not change the value. Set it to 0 since it can change.
19460 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19461 DAG.getIntPtrConstant(0, dl));
19462 case CVTPD2PS_MASK: {
19463 SDValue Src = Op.getOperand(1);
19464 SDValue PassThru = Op.getOperand(2);
19465 SDValue Mask = Op.getOperand(3);
19466 // We add rounding mode to the Node when
19467 // - RM Opcode is specified and
19468 // - RM is not "current direction".
19469 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19470 if (IntrWithRoundingModeOpcode != 0) {
19471 SDValue Rnd = Op.getOperand(4);
19472 if (!isRoundModeCurDirection(Rnd)) {
19473 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                dl, Op.getValueType(),
                                                Src, Rnd),
                                    Mask, PassThru, Subtarget, DAG);
        }
      }
19479 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19480 // ISD::FP_ROUND has a second argument that indicates if the truncation
19481 // does not change the value. Set it to 0 since it can change.
19482 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19483 DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask
19488 SDValue Src1 = Op.getOperand(1);
19489 MVT VT = Src1.getSimpleValueType();
19490 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19491 SDValue Imm = Op.getOperand(2);
19492 SDValue Mask = Op.getOperand(3);
19493 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19494 Mask.getSimpleValueType().getSizeInBits());
19495 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19500 DAG.getUNDEF(BitcastVT), FPclassMask,
19501 DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
19506 SDValue Imm = Op.getOperand(2);
19507 SDValue Mask = Op.getOperand(3);
19508 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19509 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19510 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19511 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
19516 // Comparison intrinsics with masks.
19517 // Example of transformation:
19518 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
19522 // (v2i1 (and (PCMPEQM %a, %b),
19523 // (extract_subvector
19524 // (v8i1 (bitcast %mask)), 0))), 0))))
19525 MVT VT = Op.getOperand(1).getSimpleValueType();
19526 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19527 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19528 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19529 Mask.getSimpleValueType().getSizeInBits());
19531 if (IntrData->Type == CMP_MASK_CC) {
19532 SDValue CC = Op.getOperand(3);
19533 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19534 // We specify 2 possible opcodes for intrinsics with rounding modes.
19535 // First, we check if the intrinsic may have non-default rounding mode,
19536 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19537 if (IntrData->Opc1 != 0) {
19538 SDValue Rnd = Op.getOperand(5);
19539 if (!isRoundModeCurDirection(Rnd))
19540 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC, Rnd);
        }
        // default rounding mode
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
19562 case CMP_MASK_SCALAR_CC: {
19563 SDValue Src1 = Op.getOperand(1);
19564 SDValue Src2 = Op.getOperand(2);
19565 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);
      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
19570 SDValue Rnd = Op.getOperand(5);
19571 if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // default rounding mode
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
19585 case COMI: { // Comparison intrinsics
19586 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19587 SDValue LHS = Op.getOperand(1);
19588 SDValue RHS = Op.getOperand(2);
19589 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19594 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19595 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
19599 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19600 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19601 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
19605 case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
19608 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
19612 case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
19615 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
19623 case COMI_RM: { // Comparison intrinsics with Sae
19624 SDValue LHS = Op.getOperand(1);
19625 SDValue RHS = Op.getOperand(2);
19626 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
19630 if (isRoundModeCurDirection(Sae))
19631 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19635 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19636 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
19644 SDValue Mask = Op.getOperand(3);
19645 SDValue DataToCompress = Op.getOperand(1);
19646 SDValue PassThru = Op.getOperand(2);
19647 if (isAllOnesConstant(Mask)) // return data as is
19648 return Op.getOperand(1);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case BROADCASTM: {
      SDValue Mask = Op.getOperand(1);
19656 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19657 Mask.getSimpleValueType().getSizeInBits());
19658 Mask = DAG.getBitcast(MaskVT, Mask);
19659 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19662 MVT VT = Op.getSimpleValueType();
19663 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19665 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19666 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19667 // Arguments should be swapped.
19668 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
19675 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19677 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19678 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19679 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ: {
19686 SDValue Src1 = Op.getOperand(1);
19687 SDValue Src2 = Op.getOperand(2);
19688 SDValue Src3 = Op.getOperand(3);
19689 SDValue Imm = Op.getOperand(4);
19690 SDValue Mask = Op.getOperand(5);
19691 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19692 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode. First, we check whether the intrinsic has a rounding
      // mode (7 operands); if not, we set the rounding mode to "current".
      SDValue Rnd;
19698 if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19702 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19703 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19704 Src1, Src2, Src3, Imm, Rnd),
19705 Mask, Passthru, Subtarget, DAG);
19706 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19707 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19708 Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
19711 case CONVERT_TO_MASK: {
19712 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19713 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19714 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
19718 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19719 DAG.getUNDEF(BitcastVT), CvtMask,
19720 DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
19723 case BRCST_SUBVEC_TO_VEC: {
19724 SDValue Src = Op.getOperand(1);
19725 SDValue Passthru = Op.getOperand(2);
19726 SDValue Mask = Op.getOperand(3);
19727 EVT resVT = Passthru.getValueType();
19728 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19729 DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
19736 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19737 subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
19740 case BRCST32x2_TO_VEC: {
19741 SDValue Src = Op.getOperand(1);
19742 SDValue PassThru = Op.getOperand(2);
19743 SDValue Mask = Op.getOperand(3);
19745 assert((VT.getScalarType() == MVT::i32 ||
19746 VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // Bitcast Src to packed 64-bit elements.
19748 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19749 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19750 Src = DAG.getBitcast(BitcastVT, Src);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
19763 case Intrinsic::x86_avx2_permd:
19764 case Intrinsic::x86_avx2_permps:
19765 // Operands intentionally swapped. Mask is last operand to intrinsic,
19766 // but second operand for node/instruction.
19767 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19768 Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
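  // For example (illustrative): int _mm_testz_si128(a, b) maps to
  // x86_sse41_ptestz and is lowered to
  //   (zext (setcc COND_E, (X86ISD::PTEST a, b)))
  // so the i32 result comes straight from the PTEST flags.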
19773 case Intrinsic::x86_sse41_ptestz:
19774 case Intrinsic::x86_sse41_ptestc:
19775 case Intrinsic::x86_sse41_ptestnzc:
19776 case Intrinsic::x86_avx_ptestz_256:
19777 case Intrinsic::x86_avx_ptestc_256:
19778 case Intrinsic::x86_avx_ptestnzc_256:
19779 case Intrinsic::x86_avx_vtestz_ps:
19780 case Intrinsic::x86_avx_vtestc_ps:
19781 case Intrinsic::x86_avx_vtestnzc_ps:
19782 case Intrinsic::x86_avx_vtestz_pd:
19783 case Intrinsic::x86_avx_vtestc_pd:
19784 case Intrinsic::x86_avx_vtestnzc_pd:
19785 case Intrinsic::x86_avx_vtestz_ps_256:
19786 case Intrinsic::x86_avx_vtestc_ps_256:
19787 case Intrinsic::x86_avx_vtestnzc_ps_256:
19788 case Intrinsic::x86_avx_vtestz_pd_256:
19789 case Intrinsic::x86_avx_vtestc_pd_256:
19790 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19791 bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19795 case Intrinsic::x86_avx_vtestz_ps:
19796 case Intrinsic::x86_avx_vtestz_pd:
19797 case Intrinsic::x86_avx_vtestz_ps_256:
19798 case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
19801 case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
19806 case Intrinsic::x86_avx_vtestc_ps:
19807 case Intrinsic::x86_avx_vtestc_pd:
19808 case Intrinsic::x86_avx_vtestc_ps_256:
19809 case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
19812 case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
19817 case Intrinsic::x86_avx_vtestnzc_ps:
19818 case Intrinsic::x86_avx_vtestnzc_pd:
19819 case Intrinsic::x86_avx_vtestnzc_ps_256:
19820 case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
19823 case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF = 0 and CF = 0
      X86CC = X86::COND_A;
      break;
    }
19830 SDValue LHS = Op.getOperand(1);
19831 SDValue RHS = Op.getOperand(2);
19832 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19833 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19834 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
19837 case Intrinsic::x86_avx512_kortestz_w:
19838 case Intrinsic::x86_avx512_kortestc_w: {
19839 X86::CondCode X86CC =
19840 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19841 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19842 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19843 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19844 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
19848 case Intrinsic::x86_avx512_knot_w: {
19849 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19850 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
19851 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }
19855 case Intrinsic::x86_avx512_kandn_w: {
19856 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19857 // Invert LHS for the not.
19858 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
19859 DAG.getConstant(1, dl, MVT::v16i1));
19860 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19861 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }
19865 case Intrinsic::x86_avx512_kxnor_w: {
19866 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19867 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19868 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19869 // Invert result for the not.
19870 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
19871 DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
19875 case Intrinsic::x86_sse42_pcmpistria128:
19876 case Intrinsic::x86_sse42_pcmpestria128:
19877 case Intrinsic::x86_sse42_pcmpistric128:
19878 case Intrinsic::x86_sse42_pcmpestric128:
19879 case Intrinsic::x86_sse42_pcmpistrio128:
19880 case Intrinsic::x86_sse42_pcmpestrio128:
19881 case Intrinsic::x86_sse42_pcmpistris128:
19882 case Intrinsic::x86_sse42_pcmpestris128:
19883 case Intrinsic::x86_sse42_pcmpistriz128:
19884 case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
19930 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19931 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19932 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19933 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
  }
  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }
  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }
  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  EVT MaskVT = Mask.getValueType();
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
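
// Dependency-breaking note (informal): gather instructions merge into their
// destination register, so an undef source would leave the result dependent on
// whatever last wrote that register; zeroing the destination first is the
// usual idiom to avoid that false dependency.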
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = cast<ConstantSDNode>(ScaleOp);
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
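
// Worked example (illustrative): XGETBV with ECX = 0 reads XCR0; if the OS has
// enabled x87/SSE/AVX state, EDX:EAX could be 0x00000000:0x00000007, and the
// 64-bit merge above computes (0 << 32) | 0x7 = 0x7.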
/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
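
// Illustrative flow for the RDTSCP path above: the instruction leaves the
// counter in EDX:EAX and IA32_TSC_AUX in ECX; the code merges EDX:EAX into a
// single i64 (shift/or on 64-bit, BUILD_PAIR on 32-bit) and stores ECX through
// the intrinsic's pointer operand.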
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
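
// Saturation example (illustrative): truncating the i16 value 300 to i8 stores
// 127 (0x7F) with signed saturation and 255 (0xFF) with unsigned saturation,
// whereas a plain truncating store would keep just the low byte, 44 (0x2C).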
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here; we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
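
  // The CMOV above is a branchless validity flag; roughly:
  //   isValid = CF ? 1 : Rand  (and Rand is 0 whenever CF is clear).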
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    // gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    // scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
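
  // Semantics sketch (illustrative) for the expanding load below: with mask
  // 0b0101, two consecutive elements are read from memory and placed in lanes
  // 0 and 2; unselected lanes take the pass-through value.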
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      // A zero mask loads no elements, so the result is simply the
      // pass-through operand.
      return PassThru;

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}
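
// Illustrative: @llvm.returnaddress(i32 0) becomes a load from the return
// address slot, while a depth of 1 first obtains the caller's frame via
// LowerFRAMEADDR and then loads at frame + slot size.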
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}
bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}
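
// For reference, the 64-bit trampoline written below has roughly this byte
// layout (illustrative):
//   offset  0: 49 BB <imm64>   movabsq <fptr>, %r11
//   offset 10: 49 BA <imm64>   movabsq <nest>, %r10
//   offset 20: 41 FF E3        jmpq   *%r11
// The 32-bit variant instead stores a mov-immediate into the nest register
// followed by a rel32 jmp to the nested function.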
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                     /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                     /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
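
  // Worked example (illustrative): FPSR bits 11:10 = 01 (round toward -inf)
  // gives (FPSR & 0x800) >> 11 == 0 and (FPSR & 0x400) >> 9 == 2, so
  // ((0 | 2) + 1) & 3 == 3, which is FLT_ROUNDS' encoding for round to -inf.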
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into two half-sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, split the
// vector, perform the operation on its Lo and Hi parts, and concatenate the
// results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
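
// Worked example (illustrative): for a v16i8 element 0x10, the zext gives
// 0x00000010, vplzcntd reports 27 leading zeros, and subtracting the delta
// 32 - 8 = 24 yields ctlz = 3, which matches the i8 result.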
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
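
  // Worked example (illustrative): for the byte 0x1A the hi nibble is 0x1, so
  // the LUT yields 3 and the lo result is masked away (hi != 0), giving
  // ctlz = 3; for 0x0A the hi nibble is 0, so the result is LUT[0] + LUT[0xA]
  // = 4 + 0 = 4.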

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return DAG.getBitcast(VT, Res);
}
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }
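
  // Worked example (illustrative): for x = 12 (0b1100), x & -x isolates the
  // lowest set bit, 4; then ctpop(4 - 1) = ctpop(0b011) = 2 = cttz(12).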

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16
    SDValue ALo, BLo;
    if (Subtarget.hasSSE41()) {
      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                              -1, 4, -1, 5, -1, 6, -1, 7};
      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      ALo = DAG.getBitcast(ExVT, ALo);
      BLo = DAG.getBitcast(ExVT, BLo);
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
    }

    // Extract the hi parts and sign extend to i16
    SDValue AHi, BHi;
    if (Subtarget.hasSSE41()) {
      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                              -1, -1, -1, -1, -1, -1, -1, -1};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                              -1, 12, -1, 13, -1, 14, -1, 15};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getBitcast(ExVT, AHi);
      BHi = DAG.getBitcast(ExVT, BHi);
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8bits of the lo/hi results and pack
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }
  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmuldq is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }
  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");

  // 32-bit vector types used for MULDQ/MULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
      DAG.ComputeNumSignBits(B) > 32) {
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
                       DAG.getBitcast(MulVT, B));
  }
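
  // The decomposition below follows the identity (illustrative, all mod 2^64):
  //   a * b = (alo + (ahi << 32)) * (blo + (bhi << 32))
  //         = alo*blo + ((alo*bhi + ahi*blo) << 32)
  // since the ahi*bhi term is shifted entirely out of the low 64 bits.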

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, A);
  SDValue Blo = DAG.getBitcast(MulVT, B);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
21448 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21449 SelectionDAG &DAG) {
SDLoc dl(Op);
21451 MVT VT = Op.getSimpleValueType();
21453 // Decompose 256-bit ops into smaller 128-bit ops.
21454 if (VT.is256BitVector() && !Subtarget.hasInt256())
21455 return Lower256IntArith(Op, DAG);
21457 // Only i8 vectors should need custom lowering after this.
21458 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21459 "Unsupported vector type");
21461 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21462 // logical shift down the upper half and pack back to i8.
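// A scalar model of the per-element computation (illustrative only; the
// helper names are hypothetical):
//
//   uint8_t MulHiU8(uint8_t A, uint8_t B) {   // ISD::MULHU
//     return (uint8_t)(((uint16_t)A * (uint16_t)B) >> 8);
//   }
//   int8_t MulHiS8(int8_t A, int8_t B) {      // ISD::MULHS
//     return (int8_t)(((int16_t)A * (int16_t)B) >> 8);
//   }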
21463 SDValue A = Op.getOperand(0);
21464 SDValue B = Op.getOperand(1);
21466 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21467 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21468 unsigned Opcode = Op.getOpcode();
21469 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21470 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21472 // AVX2 implementations - extend xmm subvectors to ymm.
21473 if (Subtarget.hasInt256()) {
21474 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21475 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21477 if (VT == MVT::v32i8) {
21478 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21479 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21480 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21481 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21482 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21483 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21484 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21485 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21486 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21487 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21488 DAG.getConstant(8, dl, MVT::v16i16));
21489 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21490 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21491 DAG.getConstant(8, dl, MVT::v16i16));
21492 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21493 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21494 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21495 16, 17, 18, 19, 20, 21, 22, 23};
21496 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21497 24, 25, 26, 27, 28, 29, 30, 31};
21498 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21499 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21500 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21503 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21504 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21505 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21506 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21507 DAG.getConstant(8, dl, MVT::v16i16));
21508 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21509 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21510 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21513 assert(VT == MVT::v16i8 &&
21514 "Pre-AVX2 support only supports v16i8 multiplication");
21515 MVT ExVT = MVT::v8i16;
21517 // Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
21519 if (Subtarget.hasSSE41()) {
21520 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21521 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
21523 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21524 -1, 4, -1, 5, -1, 6, -1, 7};
21525 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21526 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21527 ALo = DAG.getBitcast(ExVT, ALo);
21528 BLo = DAG.getBitcast(ExVT, BLo);
21529 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21530 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
}

21533 // Extract the hi parts and zero/sign extend to i16.
SDValue AHi, BHi;
21535 if (Subtarget.hasSSE41()) {
21536 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21537 -1, -1, -1, -1, -1, -1, -1, -1};
21538 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21539 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21540 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21541 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
21543 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21544 -1, 12, -1, 13, -1, 14, -1, 15};
21545 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21546 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21547 AHi = DAG.getBitcast(ExVT, AHi);
21548 BHi = DAG.getBitcast(ExVT, BHi);
21549 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21550 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
}

21553 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
21554 // pack back to v16i8.
21555 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21556 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21557 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21558 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21559 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21562 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21563 assert(Subtarget.isTargetWin64() && "Unexpected target");
21564 EVT VT = Op.getValueType();
21565 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21566 "Unexpected return type for lowering");
RTLIB::Libcall LC;
bool isSigned;
21570 switch (Op->getOpcode()) {
21571 default: llvm_unreachable("Unexpected request for libcall!");
21572 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21573 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21574 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21575 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21576 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21577 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
}

SDLoc dl(Op);
21581 SDValue InChain = DAG.getEntryNode();
21583 TargetLowering::ArgListTy Args;
21584 TargetLowering::ArgListEntry Entry;
21585 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21586 EVT ArgVT = Op->getOperand(i).getValueType();
21587 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21588 "Unexpected argument type for lowering");
21589 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21590 Entry.Node = StackPtr;
21591 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21592 MachinePointerInfo(), /* Alignment = */ 16);
21593 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21594 Entry.Ty = PointerType::get(ArgTy,0);
21595 Entry.IsSExt = false;
21596 Entry.IsZExt = false;
21597 Args.push_back(Entry);
21600 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21601 getPointerTy(DAG.getDataLayout()));
21603 TargetLowering::CallLoweringInfo CLI(DAG);
21604 CLI.setDebugLoc(dl)
.setChain(InChain)
.setLibCallee(
21607 getLibcallCallingConv(LC),
21608 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
std::move(Args))
.setInRegister()
21611 .setSExtResult(isSigned)
21612 .setZExtResult(!isSigned);
21614 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21615 return DAG.getBitcast(VT, CallInfo.first);
21618 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21619 SelectionDAG &DAG) {
21620 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21621 MVT VT = Op0.getSimpleValueType();
SDLoc dl(Op);

21624 // Decompose 256-bit ops into smaller 128-bit ops.
21625 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21626 unsigned Opcode = Op.getOpcode();
21627 unsigned NumElems = VT.getVectorNumElements();
21628 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21629 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21630 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21631 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21632 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21633 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21634 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
SDValue Ops[] = {
21636 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21637 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
};
21639 return DAG.getMergeValues(Ops, dl);
21642 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21643 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21645 // PMULxD operations multiply each even value (starting at 0) of LHS with
21646 // the related value of RHS and produce a widened result.
21647 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21648 // => <2 x i64> <ae|cg>
21650 // In other words, to have all the results, we need to perform two PMULxD:
21651 // 1. one with the even values.
21652 // 2. one with the odd values.
21653 // To achieve #2, we need to place the odd values at an even position.
21655 // Place the odd value at an even position (basically, shift all values 1
21656 // step to the left):
21657 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21658 // <a|b|c|d> => <b|undef|d|undef>
21659 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21660 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21661 // <e|f|g|h> => <f|undef|h|undef>
21662 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21663 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21665 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
21667 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21668 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
21670 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21671 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21672 // => <2 x i64> <ae|cg>
21673 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21674 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21675 // => <2 x i64> <bf|dh>
21676 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21678 // Shuffle it back into the right order.
21679 SDValue Highs, Lows;
21680 if (VT == MVT::v8i32) {
21681 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21682 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21683 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21684 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
} else {
21686 const int HighMask[] = {1, 5, 3, 7};
21687 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21688 const int LowMask[] = {0, 4, 2, 6};
21689 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
}

21692 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21693 // unsigned multiply.
21694 if (IsSigned && !Subtarget.hasSSE41()) {
21695 SDValue ShAmt = DAG.getConstant(
31, dl,
21697 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21698 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21699 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21700 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21701 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21703 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21704 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
}

21707 // The first result of MUL_LOHI is actually the low value, followed by the
// high one.
21709 SDValue Ops[] = {Lows, Highs};
21710 return DAG.getMergeValues(Ops, dl);
21713 // Return true if the required (according to Opcode) shift-imm form is natively
21714 // supported by the Subtarget
21715 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
21717 if (VT.getScalarSizeInBits() < 16)
return false;
21720 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21721 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;

21724 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21725 (VT.is256BitVector() && Subtarget.hasInt256());
21727 bool AShift = LShift && (Subtarget.hasAVX512() ||
21728 (VT != MVT::v2i64 && VT != MVT::v4i64));
21729 return (Opcode == ISD::SRA) ? AShift : LShift;
21732 // The shift amount is a variable, but it is the same for all vector lanes.
21733 // These instructions are defined together with shift-immediate.
static
21735 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
21737 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
21740 // Return true if the required (according to Opcode) variable-shift form is
21741 // natively supported by the Subtarget
21742 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
unsigned Opcode) {
21745 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
return false;

21748 // vXi16 supported only on AVX-512, BWI
21749 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;

21752 if (Subtarget.hasAVX512())
return true;

21755 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21756 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21757 return (Opcode == ISD::SRA) ? AShift : LShift;
21760 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21761 const X86Subtarget &Subtarget) {
21762 MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
21764 SDValue R = Op.getOperand(0);
21765 SDValue Amt = Op.getOperand(1);
21767 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21768 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21770 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21771 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21772 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21773 SDValue Ex = DAG.getBitcast(ExVT, R);
21775 // ashr(R, 63) === cmp_slt(R, 0)
21776 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
21777 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
21778 "Unsupported PCMPGT op");
21779 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
21780 getZeroVector(VT, Subtarget, DAG, dl), R);
21783 if (ShiftAmt >= 32) {
21784 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
21786 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21787 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21788 ShiftAmt - 32, DAG);
21789 if (VT == MVT::v2i64)
21790 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21791 if (VT == MVT::v4i64)
21792 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21793 {9, 1, 11, 3, 13, 5, 15, 7});
} else {
21795 // SRA upper i32, SHL whole i64 and select lower i32.
21796 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
ShiftAmt, DAG);
SDValue Lower =
21799 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21800 Lower = DAG.getBitcast(ExVT, Lower);
21801 if (VT == MVT::v2i64)
21802 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21803 if (VT == MVT::v4i64)
21804 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21805 {8, 1, 10, 3, 12, 5, 14, 7});
}
21807 return DAG.getBitcast(VT, Ex);
};

21810 // Optimize shl/srl/sra with constant shift amount.
21811 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21812 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21813 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21815 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21816 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21818 // i64 SRA needs to be performed as partial shifts.
21819 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21820 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21821 return ArithmeticShiftRight64(ShiftAmt);
21823 if (VT == MVT::v16i8 ||
21824 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21825 VT == MVT::v64i8) {
21826 unsigned NumElts = VT.getVectorNumElements();
21827 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21829 // Simple i8 add case
21830 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21831 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21833 // ashr(R, 7) === cmp_slt(R, 0)
21834 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21835 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21836 if (VT.is512BitVector()) {
21837 assert(VT == MVT::v64i8 && "Unexpected element type!");
21838 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21839 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21841 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21844 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21845 if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();

21848 if (Op.getOpcode() == ISD::SHL) {
21849 // Make a large shift.
21850 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
R, ShiftAmt, DAG);
21852 SHL = DAG.getBitcast(VT, SHL);
21853 // Zero out the rightmost bits.
21854 return DAG.getNode(ISD::AND, dl, VT, SHL,
21855 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21857 if (Op.getOpcode() == ISD::SRL) {
21858 // Make a large shift.
21859 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
R, ShiftAmt, DAG);
21861 SRL = DAG.getBitcast(VT, SRL);
21862 // Zero out the leftmost bits.
21863 return DAG.getNode(ISD::AND, dl, VT, SRL,
21864 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21866 if (Op.getOpcode() == ISD::SRA) {
21867 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21868 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21870 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21871 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21872 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
return Res;
}
21875 llvm_unreachable("Unknown shift opcode.");
21880 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21881 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
21882 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21883 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21884 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21886 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
21887 unsigned SubVectorScale = 1;
21888 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
SubVectorScale =
21890 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
21891 Amt = Amt.getOperand(0);
21894 // Peek through any splat that was introduced for i64 shift vectorization.
21895 int SplatIndex = -1;
21896 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21897 if (SVN->isSplat()) {
21898 SplatIndex = SVN->getSplatIndex();
21899 Amt = Amt.getOperand(0);
21900 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21901 "Splat shuffle referencing second operand");
21904 if (Amt.getOpcode() != ISD::BITCAST ||
21905 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
return SDValue();

21908 Amt = Amt.getOperand(0);
21909 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21910 (SubVectorScale * VT.getVectorNumElements());
21911 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21912 uint64_t ShiftAmt = 0;
21913 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21914 for (unsigned i = 0; i != Ratio; ++i) {
21915 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
if (!C)
return SDValue();
21919 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21922 // Check remaining shift amounts (if not a splat).
21923 if (SplatIndex < 0) {
21924 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21925 uint64_t ShAmt = 0;
21926 for (unsigned j = 0; j != Ratio; ++j) {
21927 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
if (!C)
return SDValue();
21931 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21933 if (ShAmt != ShiftAmt)
return SDValue();
}
}

21938 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21939 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21941 if (Op.getOpcode() == ISD::SRA)
21942 return ArithmeticShiftRight64(ShiftAmt);
}

return SDValue();
}

21948 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21949 const X86Subtarget &Subtarget) {
21950 MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
21952 SDValue R = Op.getOperand(0);
21953 SDValue Amt = Op.getOperand(1);
21955 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21956 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21958 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21959 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21961 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21963 MVT EltVT = VT.getVectorElementType();
SDValue BaseShAmt;
21965 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21966 // Check if this build_vector node is doing a splat.
21967 // If so, then set BaseShAmt equal to the splat value.
21968 BaseShAmt = BV->getSplatValue();
21969 if (BaseShAmt && BaseShAmt.isUndef())
21970 BaseShAmt = SDValue();
} else {
21972 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21973 Amt = Amt.getOperand(0);
21975 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21976 if (SVN && SVN->isSplat()) {
21977 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21978 SDValue InVec = Amt.getOperand(0);
21979 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21980 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21981 "Unexpected shuffle index found!");
21982 BaseShAmt = InVec.getOperand(SplatIdx);
21983 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21984 if (ConstantSDNode *C =
21985 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21986 if (C->getZExtValue() == SplatIdx)
21987 BaseShAmt = InVec.getOperand(1);
}
}

if (!BaseShAmt)
21992 // Avoid introducing an extract element from a shuffle.
21993 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21994 DAG.getIntPtrConstant(SplatIdx, dl));
}
}

21998 if (BaseShAmt.getNode()) {
21999 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22000 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22001 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22002 else if (EltVT.bitsLT(MVT::i32))
22003 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22005 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22009 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22010 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22011 Amt.getOpcode() == ISD::BITCAST &&
22012 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22013 Amt = Amt.getOperand(0);
22014 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22015 VT.getVectorNumElements();
22016 std::vector<SDValue> Vals(Ratio);
22017 for (unsigned i = 0; i != Ratio; ++i)
22018 Vals[i] = Amt.getOperand(i);
22019 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22020 for (unsigned j = 0; j != Ratio; ++j)
22021 if (Vals[j] != Amt.getOperand(i + j))
22025 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22026 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22031 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22032 SelectionDAG &DAG) {
22033 MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
22035 SDValue R = Op.getOperand(0);
22036 SDValue Amt = Op.getOperand(1);
22037 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22039 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22040 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22042 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
return V;

22045 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
return V;

22048 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
return Op;

22051 // XOP has 128-bit variable logical/arithmetic shifts.
22052 // +ve/-ve Amt = shift left/right.
22053 if (Subtarget.hasXOP() &&
22054 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22055 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22056 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22057 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22058 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
}
22060 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22061 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22062 if (Op.getOpcode() == ISD::SRA)
22063 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22066 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22067 // shifts per-lane and then shuffle the partial results back together.
22068 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22069 // Splat the shift amounts so the scalar shifts above will catch it.
22070 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22071 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22072 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22073 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22074 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22077 // i64 vector arithmetic shift can be emulated with the transform:
22078 // M = lshr(SIGN_MASK, Amt)
22079 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
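//
// A scalar sketch of why the identity holds (illustrative only): M has a
// single bit at the position where the sign bit lands after the logical
// shift; XOR flips it and the SUB borrows through the upper bits, recreating
// the sign extension:
//
//   int64_t AShr64(int64_t R, unsigned Amt) {
//     uint64_t M = 0x8000000000000000ULL >> Amt;
//     uint64_t L = (uint64_t)R >> Amt;         // lshr(R, Amt)
//     return (int64_t)((L ^ M) - M);           // == ashr(R, Amt)
//   }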
22080 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22081 Op.getOpcode() == ISD::SRA) {
22082 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22083 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22084 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22085 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22086 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
return R;
}

22090 // If possible, lower this packed shift into a vector multiply instead of
22091 // expanding it into a sequence of scalar shifts.
22092 // Do this only if the vector shift count is a constant build_vector.
22093 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22094 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22095 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22096 SmallVector<SDValue, 8> Elts;
22097 MVT SVT = VT.getVectorElementType();
22098 unsigned SVTBits = SVT.getSizeInBits();
22099 APInt One(SVTBits, 1);
22100 unsigned NumElems = VT.getVectorNumElements();
22102 for (unsigned i=0; i !=NumElems; ++i) {
22103 SDValue Op = Amt->getOperand(i);
22104 if (Op->isUndef()) {
22105 Elts.push_back(Op);
continue;
}

22109 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22110 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22111 uint64_t ShAmt = C.getZExtValue();
22112 if (ShAmt >= SVTBits) {
22113 Elts.push_back(DAG.getUNDEF(SVT));
continue;
}
22116 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
}
22118 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22119 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22122 // Lower SHL with variable shift amount.
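// The sequence below builds 2^Amt per lane in the float domain: for shift
// amounts below 31, (Amt << 23) + 0x3f800000 is the IEEE-754 bit pattern of
// the float 2^Amt (biased exponent 127 + Amt), so the final MUL computes
// R << Amt. A scalar sketch (illustrative only):
//
//   uint32_t ShlViaFloat(uint32_t R, uint32_t Amt) {
//     uint32_t Bits = (Amt << 23) + 0x3f800000u; // encodes float(1 << Amt)
//     float Pow2;
//     memcpy(&Pow2, &Bits, sizeof(Pow2));        // bitcast to f32
//     return R * (uint32_t)Pow2;                 // == R << Amt
//   }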
22123 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22124 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22126 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22127 DAG.getConstant(0x3f800000U, dl, VT));
22128 Op = DAG.getBitcast(MVT::v4f32, Op);
22129 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22130 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22133 // If possible, lower this shift as a sequence of two shifts by
22134 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22136 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22138 // Could be rewritten as:
22139 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22141 // The advantage is that the two shifts from the example would be
22142 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22143 // the vector shift into four scalar shifts plus four pairs of vector
22145 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22146 unsigned TargetOpcode = X86ISD::MOVSS;
22147 bool CanBeSimplified;
22148 // The splat value for the first packed shift (the 'X' from the example).
22149 SDValue Amt1 = Amt->getOperand(0);
22150 // The splat value for the second packed shift (the 'Y' from the example).
22151 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22153 // See if it is possible to replace this node with a sequence of
22154 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22155 if (VT == MVT::v4i32) {
22156 // Check if it is legal to use a MOVSS.
22157 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22158 Amt2 == Amt->getOperand(3);
22159 if (!CanBeSimplified) {
22160 // Otherwise, check if we can still simplify this node using a MOVSD.
22161 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22162 Amt->getOperand(2) == Amt->getOperand(3);
22163 TargetOpcode = X86ISD::MOVSD;
22164 Amt2 = Amt->getOperand(2);
}
} else {
22167 // Do similar checks for the case where the machine value type
// is MVT::v8i16.
22169 CanBeSimplified = Amt1 == Amt->getOperand(1);
22170 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22171 CanBeSimplified = Amt2 == Amt->getOperand(i);
22173 if (!CanBeSimplified) {
22174 TargetOpcode = X86ISD::MOVSD;
22175 CanBeSimplified = true;
22176 Amt2 = Amt->getOperand(4);
22177 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22178 CanBeSimplified = Amt1 == Amt->getOperand(i);
22179 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22180 CanBeSimplified = Amt2 == Amt->getOperand(j);
22184 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22185 isa<ConstantSDNode>(Amt2)) {
22186 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22187 MVT CastVT = MVT::v4i32;
SDValue Splat1 =
22189 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22190 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
22192 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22193 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22194 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22195 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22196 if (TargetOpcode == X86ISD::MOVSD)
22197 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22198 BitCast2, {0, 1, 6, 7}));
22199 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22200 BitCast2, {0, 5, 6, 7}));
22204 // v4i32 Non Uniform Shifts.
22205 // If the shift amount is constant we can shift each lane using the SSE2
22206 // immediate shifts, else we need to zero-extend each lane to the lower i64
22207 // and shift using the SSE2 variable shifts.
22208 // The separate results can then be blended together.
22209 if (VT == MVT::v4i32) {
22210 unsigned Opc = Op.getOpcode();
22211 SDValue Amt0, Amt1, Amt2, Amt3;
if (ConstantAmt) {
22213 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22214 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22215 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22216 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
} else {
22218 // ISD::SHL is handled above but we include it here for completeness.
switch (Opc) {
default:
22221 llvm_unreachable("Unknown target vector shift node");
case ISD::SHL:
22223 Opc = X86ISD::VSHL;
break;
case ISD::SRL:
22226 Opc = X86ISD::VSRL;
break;
case ISD::SRA:
22229 Opc = X86ISD::VSRA;
break;
}
22232 // The SSE2 shifts use the lower i64 as the same shift amount for
22233 // all lanes and the upper i64 is ignored. These shuffle masks
22234 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22235 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22236 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22237 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22238 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22239 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
}

22242 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22243 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22244 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22245 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22246 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22247 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22248 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22251 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22252 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22253 // make the existing SSE solution better.
22254 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22255 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22256 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22257 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22258 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22259 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
unsigned ExtOpc =
22261 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22262 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22263 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22264 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22265 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22268 if (VT == MVT::v16i8 ||
22269 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22270 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22271 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22272 unsigned ShiftOpcode = Op->getOpcode();
22274 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22275 if (VT.is512BitVector()) {
22276 // On AVX512BW targets we make use of the fact that VSELECT lowers
22277 // to a masked blend which selects bytes based just on the sign bit
22278 // extracted to a mask.
22279 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22280 V0 = DAG.getBitcast(VT, V0);
22281 V1 = DAG.getBitcast(VT, V1);
22282 Sel = DAG.getBitcast(VT, Sel);
22283 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22284 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22285 } else if (Subtarget.hasSSE41()) {
22286 // On SSE41 targets we make use of the fact that VSELECT lowers
22287 // to PBLENDVB which selects bytes based just on the sign bit.
22288 V0 = DAG.getBitcast(VT, V0);
22289 V1 = DAG.getBitcast(VT, V1);
22290 Sel = DAG.getBitcast(VT, Sel);
22291 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22293 // On pre-SSE41 targets we test for the sign bit by comparing to
22294 // zero - a negative value will set all bits of the lanes to true
22295 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22296 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22297 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22298 return DAG.getSelect(dl, SelVT, C, V0, V1);
22301 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22302 // We can safely do this using i16 shifts as we're only interested in
22303 // the 3 lower bits of each byte.
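// A scalar model of the bit-serial select ladder used below (illustrative,
// logical-shift case; each VSELECT step keys off the current top bit of the
// shifted-up amount):
//
//   uint8_t ShlVar8(uint8_t R, uint8_t Amt) {
//     uint8_t A = Amt << 5;            // move bit 2 of Amt to the sign bit
//     if (A & 0x80) R <<= 4;  A += A;  // select, then re-arm the next bit
//     if (A & 0x80) R <<= 2;  A += A;
//     if (A & 0x80) R <<= 1;
//     return R;
//   }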
22304 Amt = DAG.getBitcast(ExtVT, Amt);
22305 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22306 Amt = DAG.getBitcast(VT, Amt);
22308 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22309 // r = VSELECT(r, shift(r, 4), a);
SDValue M =
22311 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22312 R = SignBitSelect(VT, Amt, M, R);
22315 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22317 // r = VSELECT(r, shift(r, 2), a);
22318 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22319 R = SignBitSelect(VT, Amt, M, R);
22322 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22324 // return VSELECT(r, shift(r, 1), a);
22325 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22326 R = SignBitSelect(VT, Amt, M, R);
return R;
}

22330 if (Op->getOpcode() == ISD::SRA) {
22331 // For SRA we need to unpack each byte to the higher byte of a i16 vector
22332 // so we can correctly sign extend. We don't care what happens to the
// lower byte.
22334 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22335 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22336 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22337 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22338 ALo = DAG.getBitcast(ExtVT, ALo);
22339 AHi = DAG.getBitcast(ExtVT, AHi);
22340 RLo = DAG.getBitcast(ExtVT, RLo);
22341 RHi = DAG.getBitcast(ExtVT, RHi);
22343 // r = VSELECT(r, shift(r, 4), a);
22344 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22345 DAG.getConstant(4, dl, ExtVT));
22346 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22347 DAG.getConstant(4, dl, ExtVT));
22348 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22349 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22352 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22353 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22355 // r = VSELECT(r, shift(r, 2), a);
22356 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22357 DAG.getConstant(2, dl, ExtVT));
22358 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22359 DAG.getConstant(2, dl, ExtVT));
22360 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22361 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22364 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22365 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22367 // r = VSELECT(r, shift(r, 1), a);
22368 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22369 DAG.getConstant(1, dl, ExtVT));
22370 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22371 DAG.getConstant(1, dl, ExtVT));
22372 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22373 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22375 // Logical shift the result back to the lower byte, leaving a zero upper
// byte, meaning that we can safely pack with PACKUSWB.
RLo =
22379 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
RHi =
22381 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22382 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
}

22386 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22387 MVT ExtVT = MVT::v8i32;
22388 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22389 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22390 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22391 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22392 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22393 ALo = DAG.getBitcast(ExtVT, ALo);
22394 AHi = DAG.getBitcast(ExtVT, AHi);
22395 RLo = DAG.getBitcast(ExtVT, RLo);
22396 RHi = DAG.getBitcast(ExtVT, RHi);
22397 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22398 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22399 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22400 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22401 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22404 if (VT == MVT::v8i16) {
22405 unsigned ShiftOpcode = Op->getOpcode();
22407 // If we have a constant shift amount, the non-SSE41 path is best as
22408 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
22409 bool UseSSE41 = Subtarget.hasSSE41() &&
22410 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22412 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22413 // On SSE41 targets we make use of the fact that VSELECT lowers
22414 // to PBLENDVB which selects bytes based just on the sign bit.
if (UseSSE41) {
22416 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22417 V0 = DAG.getBitcast(ExtVT, V0);
22418 V1 = DAG.getBitcast(ExtVT, V1);
22419 Sel = DAG.getBitcast(ExtVT, Sel);
22420 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
22422 // On pre-SSE41 targets we splat the sign bit - a negative value will
22423 // set all bits of the lanes to true and VSELECT uses that in
22424 // its OR(AND(V0,C),AND(V1,~C)) lowering.
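// Scalar model (illustrative): C = Sel >> 15 is all-ones or all-zeros per
// lane, so the select computes (V0 & C) | (V1 & ~C).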
SDValue C =
22426 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22427 return DAG.getSelect(dl, VT, C, V0, V1);
};

22430 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
if (UseSSE41) {
22432 // On SSE41 targets we need to replicate the shift mask in both
22433 // bytes for PBLENDVB.
Amt = DAG.getNode(
ISD::OR, dl, VT,
22436 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22437 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
} else {
22439 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
}
22442 // r = VSELECT(r, shift(r, 8), a);
22443 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22444 R = SignBitSelect(Amt, M, R);
22447 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22449 // r = VSELECT(r, shift(r, 4), a);
22450 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22451 R = SignBitSelect(Amt, M, R);
22454 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22456 // r = VSELECT(r, shift(r, 2), a);
22457 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22458 R = SignBitSelect(Amt, M, R);
22461 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22463 // return VSELECT(r, shift(r, 1), a);
22464 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22465 R = SignBitSelect(Amt, M, R);
return R;
}

22469 // Decompose 256-bit shifts into smaller 128-bit shifts.
22470 if (VT.is256BitVector())
22471 return Lower256IntArith(Op, DAG);
return SDValue();
}

22476 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22477 SelectionDAG &DAG) {
22478 MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
22480 SDValue R = Op.getOperand(0);
22481 SDValue Amt = Op.getOperand(1);
22483 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22484 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22485 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22487 // XOP has 128-bit vector variable + immediate rotates.
22488 // +ve/-ve Amt = rotate left/right.
22490 // Split 256-bit integers.
22491 if (VT.is256BitVector())
22492 return Lower256IntArith(Op, DAG);
22494 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22496 // Attempt to rotate by immediate.
22497 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22498 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22499 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22500 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22501 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22502 DAG.getConstant(RotateAmt, DL, MVT::i8));
22506 // Use general rotate by variable (per-element).
22507 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22510 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22511 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22512 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22513 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22514 // has only one use.
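// Shape sketch of the UADDO case (illustrative, not exact DAG syntax):
//   (i32 Sum, i32 EFLAGS) = X86ISD::ADD LHS, RHS  // EFLAGS produced too
//   i8 Ovf = setcc(COND_B, EFLAGS)                // carry flag = overflow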
22515 SDNode *N = Op.getNode();
22516 SDValue LHS = N->getOperand(0);
22517 SDValue RHS = N->getOperand(1);
22518 unsigned BaseOp = 0;
22519 X86::CondCode Cond;
SDLoc DL(Op);
22521 switch (Op.getOpcode()) {
22522 default: llvm_unreachable("Unknown ovf instruction!");
case ISD::SADDO:
22524 // An add of one will be selected as an INC. Note that INC doesn't
22525 // set CF, so we can't do this for UADDO.
22526 if (isOneConstant(RHS)) {
22527 BaseOp = X86ISD::INC;
22528 Cond = X86::COND_O;
break;
}
22531 BaseOp = X86ISD::ADD;
22532 Cond = X86::COND_O;
break;
case ISD::UADDO:
22535 BaseOp = X86ISD::ADD;
22536 Cond = X86::COND_B;
break;
case ISD::SSUBO:
22539 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22540 // set CF, so we can't do this for USUBO.
22541 if (isOneConstant(RHS)) {
22542 BaseOp = X86ISD::DEC;
22543 Cond = X86::COND_O;
break;
}
22546 BaseOp = X86ISD::SUB;
22547 Cond = X86::COND_O;
break;
case ISD::USUBO:
22550 BaseOp = X86ISD::SUB;
22551 Cond = X86::COND_B;
break;
case ISD::SMULO:
22554 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22555 Cond = X86::COND_O;
break;
22557 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22558 if (N->getValueType(0) == MVT::i8) {
22559 BaseOp = X86ISD::UMUL8;
22560 Cond = X86::COND_O;
break;
}
22563 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
MVT::i32);
22565 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

22567 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

22569 if (N->getValueType(1) == MVT::i1)
22570 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

22572 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}

22576 // Also sets EFLAGS.
22577 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22578 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22580 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22582 if (N->getValueType(1) == MVT::i1)
22583 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22585 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22588 /// Returns true if the operand type is exactly twice the native width, and
22589 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22590 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22591 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22592 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22593 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
if (OpWidth == 64)
22596 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22597 else if (OpWidth == 128)
22598 return Subtarget.hasCmpxchg16b();
return false;
}

22603 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22604 return needsCmpXchgNb(SI->getValueOperand()->getType());
22607 // Note: this turns large loads into lock cmpxchg8b/16b.
22608 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22609 TargetLowering::AtomicExpansionKind
22610 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22611 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22612 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22613 : AtomicExpansionKind::None;
22616 TargetLowering::AtomicExpansionKind
22617 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22618 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22619 Type *MemType = AI->getType();
22621 // If the operand is too big, we must see if cmpxchg8/16b is available
22622 // and default to library calls otherwise.
22623 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22624 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22625 : AtomicExpansionKind::None;
22628 AtomicRMWInst::BinOp Op = AI->getOperation();
switch (Op) {
default:
22631 llvm_unreachable("Unknown atomic operation");
22632 case AtomicRMWInst::Xchg:
22633 case AtomicRMWInst::Add:
22634 case AtomicRMWInst::Sub:
22635 // It's better to use xadd, xsub or xchg for these in all cases.
22636 return AtomicExpansionKind::None;
22637 case AtomicRMWInst::Or:
22638 case AtomicRMWInst::And:
22639 case AtomicRMWInst::Xor:
22640 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22641 // prefix to a normal instruction for these operations.
22642 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22643 : AtomicExpansionKind::None;
22644 case AtomicRMWInst::Nand:
22645 case AtomicRMWInst::Max:
22646 case AtomicRMWInst::Min:
22647 case AtomicRMWInst::UMax:
22648 case AtomicRMWInst::UMin:
22649 // These always require a non-trivial set of data operations on x86. We must
22650 // use a cmpxchg loop.
22651 return AtomicExpansionKind::CmpXChg;
}
}

LoadInst *
22656 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22657 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22658 Type *MemType = AI->getType();
22659 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22660 // there is no benefit in turning such RMWs into loads, and it is actually
22661 // harmful as it introduces a mfence.
22662 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
return nullptr;

22665 auto Builder = IRBuilder<>(AI);
22666 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22667 auto SynchScope = AI->getSynchScope();
22668 // We must restrict the ordering to avoid generating loads with Release or
22669 // ReleaseAcquire orderings.
22670 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22671 auto Ptr = AI->getPointerOperand();
22673 // Before the load we need a fence. Here is an example lifted from
22674 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22677 // x.store(1, relaxed);
22678 // r1 = y.fetch_add(0, release);
22680 // y.fetch_add(42, acquire);
22681 // r2 = x.load(relaxed);
22682 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22683 // lowered to just a load without a fence. A mfence flushes the store buffer,
22684 // making the optimization clearly correct.
22685 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22686 // otherwise, we might be able to be more aggressive on relaxed idempotent
22687 // rmw. In practice, they do not look useful, so we don't try to be
22688 // especially clever.
22689 if (SynchScope == SingleThread)
22690 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22691 // the IR level, so we must wrap it in an intrinsic.
return nullptr;

22694 if (!Subtarget.hasMFence())
22695 // FIXME: it might make sense to use a locked operation here but on a
22696 // different cache-line to prevent cache-line bouncing. In practice it
22697 // is probably a small win, and x86 processors without mfence are rare
22698 // enough that we do not bother.
return nullptr;

Function *MFence =
22702 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22703 Builder.CreateCall(MFence, {});
22705 // Finally we can emit the atomic load.
22706 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22707 AI->getType()->getPrimitiveSizeInBits());
22708 Loaded->setAtomic(Order, SynchScope);
22709 AI->replaceAllUsesWith(Loaded);
22710 AI->eraseFromParent();
22714 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22715 SelectionDAG &DAG) {
SDLoc dl(Op);
22717 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22718 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22719 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22720 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22722 // The only fence that needs an instruction is a sequentially-consistent
22723 // cross-thread fence.
22724 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22725 FenceScope == CrossThread) {
22726 if (Subtarget.hasMFence())
22727 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22729 SDValue Chain = Op.getOperand(0);
22730 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22732 DAG.getRegister(X86::ESP, MVT::i32), // Base
22733 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22734 DAG.getRegister(0, MVT::i32), // Index
22735 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22736 DAG.getRegister(0, MVT::i32), // Segment.
22740 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22741 return SDValue(Res, 0);
22744 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22745 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22748 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22749 SelectionDAG &DAG) {
22750 MVT T = Op.getSimpleValueType();
SDLoc DL(Op);
unsigned Reg = 0;
unsigned size = 0;
22754 switch(T.SimpleTy) {
22755 default: llvm_unreachable("Invalid value type!");
22756 case MVT::i8: Reg = X86::AL; size = 1; break;
22757 case MVT::i16: Reg = X86::AX; size = 2; break;
22758 case MVT::i32: Reg = X86::EAX; size = 4; break;
case MVT::i64:
22760 assert(Subtarget.is64Bit() && "Node not type legal!");
22761 Reg = X86::RAX; size = 8;
break;
}

22764 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22765 Op.getOperand(2), SDValue());
22766 SDValue Ops[] = { cpIn.getValue(0),
Op.getOperand(1),
Op.getOperand(3),
22769 DAG.getTargetConstant(size, DL, MVT::i8),
22770 cpIn.getValue(1) };
22771 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22772 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22773 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
Ops, T, MMO);
SDValue cpOut =
22777 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22778 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22779 MVT::i32, cpOut.getValue(2));
22780 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22782 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22783 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22784 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
return SDValue();
}

22788 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22789 SelectionDAG &DAG) {
22790 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22791 MVT DstVT = Op.getSimpleValueType();
22793 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22794 SrcVT == MVT::i64) {
22795 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22796 if (DstVT != MVT::f64)
22797 // This conversion needs to be expanded.
return SDValue();

22800 SDValue Op0 = Op->getOperand(0);
22801 SmallVector<SDValue, 16> Elts;
SDLoc dl(Op);
unsigned NumElts;
EVT SVT;
22805 if (SrcVT.isVector()) {
22806 NumElts = SrcVT.getVectorNumElements();
22807 SVT = SrcVT.getVectorElementType();
22809 // Widen the input vector in the case of MVT::v2i32.
22810 // Example: from MVT::v2i32 to MVT::v4i32.
22811 for (unsigned i = 0, e = NumElts; i != e; ++i)
22812 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22813 DAG.getIntPtrConstant(i, dl)));
22815 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22816 "Unexpected source type in LowerBITCAST");
22817 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22818 DAG.getIntPtrConstant(0, dl)));
22819 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22820 DAG.getIntPtrConstant(1, dl)));
22824 // Explicitly mark the extra elements as Undef.
22825 Elts.append(NumElts, DAG.getUNDEF(SVT));
22827 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22828 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22829 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22830 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22831 DAG.getIntPtrConstant(0, dl));
22834 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22835 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22836 assert((DstVT == MVT::i64 ||
22837 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22838 "Unexpected custom BITCAST");
22839 // i64 <=> MMX conversions are Legal.
22840 if (SrcVT==MVT::i64 && DstVT.isVector())
return Op;
22842 if (DstVT==MVT::i64 && SrcVT.isVector())
return Op;
22844 // MMX <=> MMX conversions are Legal.
22845 if (SrcVT.isVector() && DstVT.isVector())
return Op;
22847 // All other conversions need to be expanded.
return SDValue();
}
22851 /// Compute the horizontal sum of bytes in V for the elements of VT.
22853 /// Requires V to be a byte vector and VT to be an integer vector type with
22854 /// wider elements than V's type. The width of the elements of VT determines
22855 /// how many bytes of V are summed horizontally to produce each element of the
22857 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22858 const X86Subtarget &Subtarget,
22859 SelectionDAG &DAG) {
SDLoc DL(V);
22861 MVT ByteVecVT = V.getSimpleValueType();
22862 MVT EltVT = VT.getVectorElementType();
22863 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22864 "Expected value to have byte element type.");
22865 assert(EltVT != MVT::i8 &&
22866 "Horizontal byte sum only makes sense for wider elements!");
22867 unsigned VecSize = VT.getSizeInBits();
22868 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22870 // The PSADBW instruction horizontally adds all bytes and leaves the result
22871 // in i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
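// A scalar model of one PSADBW lane against a zero vector (illustrative):
//
//   uint64_t SumBytes(const uint8_t B[8]) {
//     uint64_t Sum = 0;
//     for (int i = 0; i != 8; ++i)
//       Sum += B[i];                   // |B[i] - 0| == B[i]
//     return Sum;                      // one i64 chunk per 8 bytes
//   }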
22872 if (EltVT == MVT::i64) {
22873 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22874 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22875 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22876 return DAG.getBitcast(VT, V);
22879 if (EltVT == MVT::i32) {
22880 // We unpack the low half and high half into i32s interleaved with zeros so
22881 // that we can use PSADBW to horizontally sum them. The most useful part of
22882 // this is that it lines up the results of two PSADBW instructions to be
22883 // two v2i64 vectors which concatenated are the 4 population counts. We can
22884 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22885 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22886 SDValue V32 = DAG.getBitcast(VT, V);
22887 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22888 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22890 // Do the horizontal sums into two v2i64s.
22891 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22892 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22893 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22894 DAG.getBitcast(ByteVecVT, Low), Zeros);
22895 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22896 DAG.getBitcast(ByteVecVT, High), Zeros);
22898 // Merge them together.
22899 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22900 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22901 DAG.getBitcast(ShortVecVT, Low),
22902 DAG.getBitcast(ShortVecVT, High));
22904 return DAG.getBitcast(VT, V);
22907 // The only element type left is i16.
22908 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22910 // To obtain pop count for each i16 element starting from the pop count for
22911 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22912 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22913 // directly supported.
22914 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22915 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22916 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22917 DAG.getBitcast(ByteVecVT, V));
22918 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
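
// Illustrative sketch (not from the original source): per 16-bit lane holding
// two byte counts [hi|lo], the shift trick above computes
//   ((V << 8) + V) >> 8 == hi + lo
// e.g. for a lane 0x0302 (hi count 3, lo count 2):
//   shl 8 -> 0x0200; add as i8s -> 0x0502; srl 8 -> 0x0005 == 5.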

static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that each nibble of every byte in the input vector is
  // an index into an in-register pre-computed pop count table. We then split
  // up the input vector in two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for its input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
  //
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
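
// Worked example (illustration only): for the input byte 0xE7, the high
// nibble is 0xE -> LUT[0xE] = 3 and the low nibble is 0x7 -> LUT[0x7] = 3,
// so the two PSHUFB lookups add up to 3 + 3 = 6 == popcnt(0xE7).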

static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.

  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyway
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // type.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
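
// Scalar sketch of the same bithack (illustration only, assuming a 32-bit v):
//   v = v - ((v >> 1) & 0x55555555);
//   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//   v = (v + (v >> 4)) & 0x0F0F0F0F;
// after which each byte of v holds the pop count of the corresponding input
// byte; the vector code above performs exactly these steps per element.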

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}

static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}
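
// Note (illustration): a VPPERM mask byte selects a source byte with its low
// 5 bits (values 16 + n pick byte n of the second source) and applies the
// operation encoded in its top bits; operation 2 (i.e. 2 << 5) bit-reverses
// the selected byte, so mask byte (2 << 5) | 16 emits bitreverse(src2[0]).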

static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasXOP())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
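
// Worked example (illustration only): for the input byte 0x1E,
//   Lo = 0x1E & 0xF = 0xE -> LoLUT[0xE] = 0x70 (0b1110 reversed into the
//   high nibble), Hi = 0x1E >> 4 = 0x1 -> HiLUT[0x1] = 0x08,
// and 0x70 | 0x08 = 0x78 == bitreverse(0x1E).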

static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}
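
// For example (illustration only, register names arbitrary): an IR sequence
// whose result is unused, such as
//   %old = atomicrmw and i32* %p, i32 7 seq_cst   ; %old has no uses
// becomes a single "lock andl $7, (%rdi)" via X86ISD::LAND, whereas the same
// atomicrmw with a used result (other than add/sub, which map onto XADD) has
// already been expanded to a cmpxchg loop by AtomicExpand.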

static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}

static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getNode()->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid code");
  case ISD::ADDC: Opc = X86ISD::ADD; break;
  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
  case ISD::SUBC: Opc = X86ISD::SUB; break;
  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                       Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                     Op.getOperand(1), Op.getOperand(2));
}

static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(Op);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
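
// Note (illustration): the "Carry + (-1)" addition above merely rematerializes
// EFLAGS.CF from a value. E.g. with an i8 carry of 1: 1 + 0xFF wraps to 0x00
// and sets CF = 1, while a carry of 0 gives 0xFF and CF = 0, so the following
// ADC/SBB consumes exactly the incoming carry bit.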

static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:64 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
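
// Illustrative result shapes (for the Darwin entry points described above):
// for f32, __sincosf_stret packs { sin, cos } into xmm0, so the two lanes are
// extracted individually; for f64, __sincos_stret returns sin in xmm0 and cos
// in xmm1, which the { double, double } struct return already models.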

/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
      DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
    DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
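
// Example use (illustration only): widening a v2i1 mask with
//   ExtendToType(Mask, MVT::v8i1, DAG, /*FillWithZeroes=*/true)
// yields <m0, m1, 0, 0, 0, 0, 0, 0>, i.e. the extra lanes are disabled rather
// than undef, which is what the masked load/store/gather/scatter lowerings
// below rely on.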

static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  // X86 scatter kills the mask register, so its type should be added to
  // the list of return values.
  // If the "scatter" has 2 return values, it is already handled.
  if (Op.getNode()->getNumValues() == 2)
    return Op;

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue NewScatter;
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // by taking elements 0 and 2 of the bitcast v4i32 value.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // The mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    VT = MVT::v4i32;
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // should be 512 bits wide. If both index and data are 256-bit, but the
    // index vector contains 8 elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimum number of elements in a scatter is 8.
      NumElts = 8;
      // Index
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask
      // At this point we have a promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point - truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
                                    N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}
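
// For instance (illustration only), a v2i32 scatter reaching this point has
// its value re-narrowed from the promoted v2i64 and widened to v4i32, and the
// index and mask widened to match, with the mask zero-filled so that the
// extra lanes store nothing.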

static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. These types for expanding loads are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // The mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // The mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // should be 512 bits wide. If both index and data are 256-bit, but the
    // vector contains 8 elements, we just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                        N->getOperand(3), Index };
      DAG.UpdateNodeOperands(N, Ops);
      return Op;
    }

    // The minimum number of elements in a gather is 8.
    NumElts = 8;
    // Index
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point we have a promoted mask operand.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-through value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
                                            N->getMemoryVT(), dl, Ops,
                                            N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
  return Op;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, Subtarget, DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget, DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDC:
  case ISD::ADDE:
  case ISD::SUBC:
  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:                return LowerADD_SUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
  case ISD::ABS:                return LowerABS(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
  }
}

/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

  if (!Res.getNode())
    return;

  assert((N->getNumValues() <= Res->getNumValues()) &&
         "Lowering returned the wrong number of results!");

  // Place new result values based on the N result number.
  // In some cases (LowerSINT_TO_FP for example) Res has more result values
  // than the original node; the chain should be dropped (the last value).
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}

/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    const unsigned RegSize =
        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    assert((Subtarget.hasBWI() || RegSize < 512) &&
           "512-bit vector requires AVX512BW");
    assert((Subtarget.hasAVX2() || RegSize < 256) &&
           "256-bit vector requires AVX2");

    auto ElemVT = InVT.getVectorElementType();
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
                                  RegSize / ElemVT.getSizeInBits());
    assert(RegSize % InVT.getSizeInBits() == 0);
    unsigned NumConcat = RegSize / InVT.getSizeInBits();

    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
    return;
  }
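  // Example of the widening above (illustration only): for a v4i16 AVG the
  // input is 64 bits, so RegSize = 128, RegVT = v8i16 and NumConcat = 2; each
  // operand is padded with one undef v4i16, averaged as v8i16, and the low
  // v4i16 subvector is extracted as the result.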
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
    Results.push_back(V);
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    if (N->getValueType(0) == MVT::v2i32) {
      assert((IsSigned || Subtarget.hasAVX512()) &&
             "Can only handle signed conversion without AVX512");
      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
      SDValue Src = N->getOperand(0);
      if (Src.getValueType() == MVT::v2f64) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
                                           : X86ISD::CVTTP2UI,
                                  dl, MVT::v4i32, Src);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }
      if (Src.getValueType() == MVT::v2f32) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                  DAG.getUNDEF(MVT::v2f32));
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }

      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
      // so early out here.
      return;
    }

    std::pair<SDValue, SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(
            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::SINT_TO_FP: {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = N->getOperand(0);
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
      return;
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
      return;
    SDValue Src = N->getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
      return;
    }
    if (SrcVT != MVT::v2i32)
      return;
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
    SDValue VBias =
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
    // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
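  // Note on the constant above (illustration): 0x4330000000000000 is the
  // IEEE-754 bit pattern of 2^52. OR-ing a zero-extended 32-bit integer into
  // the mantissa of 2^52 produces the double 2^52 + x exactly, so subtracting
  // the bias yields x as a double without an int-to-fp instruction,
  // e.g. x = 5: (2^52 + 5) - 2^52 == 5.0.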
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::FP_EXTEND: {
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
    // No other ValueType for FP_EXTEND should reach this point.
    assert(N->getValueType(0) == MVT::v2f32 &&
           "Do not know how to legalize this Node");
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default : llvm_unreachable("Do not know how to custom type "
                               "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

    case Intrinsic::x86_xgetbv:
      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
    }
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
      Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, dl, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, dl, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, dl, HalfT));
    swapInH =
        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
                         swapInH, cpInH.getValue(1));
    // If the current function needs the base pointer, RBX,
    // we shouldn't use cmpxchg directly.
    // Indeed the lowering of that instruction will clobber
    // that register and since RBX will be a reserved register
    // the register allocator will not make sure its value will
    // be properly saved and restored around this live-range.
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    SDValue Result;
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned BasePtr = TRI->getBaseRegister();
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
      // ISel prefers the LCMPXCHG64 variant.
      // If that assert breaks, that means it is not the case anymore,
      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
      // not just EBX. This is a matter of accepting i64 input for that
      // pseudo, and restoring into the register of the right width
      // in the expand pseudo. Everything else should just work.
      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
             "Saving only half of the RBX");
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
                                           Regs64bit ? X86::RBX : X86::EBX,
                                           HalfT, swapInH.getValue(1));
      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
                       RBXSave,
                       /*Glue*/ RBXSave.getValue(2)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    } else {
      unsigned Opcode =
          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
                                 swapInH.getValue(1));
      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
                       swapInL.getValue(1)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    }
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
    return;
  }
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
  }
  case ISD::BITCAST: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0)->getValueType(0);

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
      return;

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

    if (ExperimentalVectorWideningLegalization) {
      // If we are legalizing vectors by widening, we already have the desired
      // legal vector type, just return it.
      Results.push_back(ToVecInt);
      return;
    }

    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
  }
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER: break;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FANDN: return "X86ISD::FANDN";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
  case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::CMPM: return "X86ISD::CMPM";
  case X86ISD::CMPMU: return "X86ISD::CMPMU";
  case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCC: return "X86ISD::FSETCC";
  case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
  case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::IRET: return "X86ISD::IRET";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
  case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
  case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::BLENDI: return "X86ISD::BLENDI";
  case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
  case X86ISD::ADDUS: return "X86ISD::ADDUS";
  case X86ISD::SUBUS: return "X86ISD::SUBUS";
  case X86ISD::HADD: return "X86ISD::HADD";
  case X86ISD::HSUB: return "X86ISD::HSUB";
  case X86ISD::FHADD: return "X86ISD::FHADD";
  case X86ISD::FHSUB: return "X86ISD::FHSUB";
  case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMAXS: return "X86ISD::FMAXS";
  case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
  case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMINS: return "X86ISD::FMINS";
  case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
  case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::FRCPS: return "X86ISD::FRCPS";
  case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
  case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
  case X86ISD::LADD: return "X86ISD::LADD";
  case X86ISD::LSUB: return "X86ISD::LSUB";
  case X86ISD::LOR: return "X86ISD::LOR";
  case X86ISD::LXOR: return "X86ISD::LXOR";
  case X86ISD::LAND: return "X86ISD::LAND";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
  case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
  case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
  case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
  case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
  case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
  case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
  case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
  case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::VSRAV: return "X86ISD::VSRAV";
  case X86ISD::VROTLI: return "X86ISD::VROTLI";
  case X86ISD::VROTRI: return "X86ISD::VROTRI";
  case X86ISD::VPPERM: return "X86ISD::VPPERM";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::SMUL8: return "X86ISD::SMUL8";
  case X86ISD::UMUL8: return "X86ISD::UMUL8";
  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
  case X86ISD::BEXTR: return "X86ISD::BEXTR";
  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
  case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
  case X86ISD::PTEST: return "X86ISD::PTEST";
  case X86ISD::TESTP: return "X86ISD::TESTP";
  case X86ISD::TESTM: return "X86ISD::TESTM";
  case X86ISD::TESTNM: return "X86ISD::TESTNM";
  case X86ISD::KORTEST: return "X86ISD::KORTEST";
  case X86ISD::KTEST: return "X86ISD::KTEST";
  case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
  case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
  case X86ISD::PACKSS: return "X86ISD::PACKSS";
  case X86ISD::PACKUS: return "X86ISD::PACKUS";
  case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
  case X86ISD::VALIGN: return "X86ISD::VALIGN";
  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP: return "X86ISD::SHUFP";
  case X86ISD::SHUF128: return "X86ISD::SHUF128";
  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD: return "X86ISD::MOVSD";
  case X86ISD::MOVSS: return "X86ISD::MOVSS";
  case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
  case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
  case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
  case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
  case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
  case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV: return "X86ISD::VPERMV";
  case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
  case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
  case X86ISD::VPERMI: return "X86ISD::VPERMI";
  case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
  case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
  case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
  case X86ISD::VRANGE: return "X86ISD::VRANGE";
  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
  case X86ISD::PSADBW: return "X86ISD::PSADBW";
  case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
  case X86ISD::MFENCE: return "X86ISD::MFENCE";
  case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
  case X86ISD::SAHF: return "X86ISD::SAHF";
  case X86ISD::RDRAND: return "X86ISD::RDRAND";
  case X86ISD::RDSEED: return "X86ISD::RDSEED";
  case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
  case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
  case X86ISD::VPROT: return "X86ISD::VPROT";
  case X86ISD::VPROTI: return "X86ISD::VPROTI";
  case X86ISD::VPSHA: return "X86ISD::VPSHA";
  case X86ISD::VPSHL: return "X86ISD::VPSHL";
  case X86ISD::VPCOM: return "X86ISD::VPCOM";
  case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
  case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
  case X86ISD::FMADD: return "X86ISD::FMADD";
  case X86ISD::FMSUB: return "X86ISD::FMSUB";
  case X86ISD::FNMADD: return "X86ISD::FNMADD";
  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
  case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
  case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
  case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
  case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
  case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
  case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
  case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
  case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
  case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
  case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
  case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
  case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
  case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
  case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
  case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
  case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
  case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
  case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
  case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
  case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST: return "X86ISD::XTEST";
  case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
  case X86ISD::EXPAND: return "X86ISD::EXPAND";
  case X86ISD::SELECT: return "X86ISD::SELECT";
  case X86ISD::SELECTS: return "X86ISD::SELECTS";
  case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
  case X86ISD::RCP28: return "X86ISD::RCP28";
  case X86ISD::RCP28S: return "X86ISD::RCP28S";
  case X86ISD::EXP2: return "X86ISD::EXP2";
  case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
  case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
  case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
  case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
  case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
  case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
  case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
  case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
  case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
  case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
  case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
  case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
  case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
  case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
  case X86ISD::SCALEF: return "X86ISD::SCALEF";
  case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
  case X86ISD::ADDS: return "X86ISD::ADDS";
  case X86ISD::SUBS: return "X86ISD::SUBS";
  case X86ISD::AVG: return "X86ISD::AVG";
  case X86ISD::MULHRS: return "X86ISD::MULHRS";
  case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
  case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
  case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
  case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
  case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
  case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
  case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
  case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
  case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
  case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
  case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
  case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
  case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
  case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
  case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
  case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
  case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
  case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
  case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
  case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
  case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
  case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
  case X86ISD::LWPINS: return "X86ISD::LWPINS";
  }
  return nullptr;
}

/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || isPositionIndependent()) &&
        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}
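
// For example, all of the following fold into a single memory operand once
// the checks above pass (scale in {1,2,4,8}, displacement a sign-extended
// 32-bit immediate):
//   movl 16(%rdi), %eax          ; base + disp
//   movl (%rdi,%rcx,4), %eax     ; base + 4*index
//   movl 16(%rdi,%rcx,8), %eax   ; base + 8*index + disp
// Scales of 3, 5 and 9 are only encodable by reusing the index register as
// the base as well (e.g. leaq (%rcx,%rcx,4), %rax computes 5*index), which is
// why they are rejected above whenever a base register is already taken.
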
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, but versions with a scalar amount
  // aren't particularly cheaper than those without.
  if (Bits == 8)
    return false;

  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that
  // make variable shifts just as cheap as scalar ones.
  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
    return false;

  // Otherwise, it's significantly cheaper to shift by a scalar amount than by
  // a fully general vector.
  return true;
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}
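
// e.g. truncating i64 to i32 is free here: the low 32 bits are already
// available as the 32-bit subregister (%eax of %rax), so the truncate itself
// needs no instruction.
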
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}
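
// e.g. an i8/i16/i32 load feeding a zext becomes a single
// movzbl/movzwl/movl, no more expensive than the plain load, so folding the
// extension into the load costs nothing.
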
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }

bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!Subtarget.hasAnyFMA())
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
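
// e.g. "addw $1, %ax" carries the 0x66 operand-size prefix that
// "addl $1, %eax" does not, and 16-bit partial-register writes can stall on
// some microarchitectures, so narrowing i32 operations to i16 is declined.
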
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  // Not for i1 vectors
  if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSimpleVT().getSizeInBits() == 64)
    return false;

  // We only care that the types being shuffled are legal. The lowering can
  // handle any possible shuffle mask that results.
  return isTypeLegal(VT.getSimpleVT());
}
bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  // Just delegate to the generic legality, clear masks aren't special.
  return isShuffleMaskLegal(Mask, VT);
}
//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI.getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  s0 = -1
  //
  // fallBB:
  //  eax = # XABORT_DEF
  //  s1 = eax
  //
  // sinkMBB:
  //  v = phi(s0/mainBB, s1/fallBB)

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, fallMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned fallDstReg = MRI.createVirtualRegister(RC);

  // thisMBB:
  //  xbegin fallMBB
  //  # fallthrough to mainMBB
  //  # abortion to fallMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(fallMBB);

  // mainMBB:
  //  mainDstReg := -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  mainMBB->addSuccessor(sinkMBB);

  // fallMBB:
  //  ; pseudo instruction to model hardware's definition from XABORT
  //  EAX := XABORT_DEF
  //  fallDstReg := EAX
  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
      .addReg(X86::EAX);
  fallMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(fallDstReg).addMBB(fallMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
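
// The emitted code for "v = xbegin()" therefore looks like this sketch
// (s0/s1 are the virtual registers from the block comment above):
//
//   xbegin  .Lfall      # on abort, HW resumes at .Lfall, status in EAX
//   s0 = -1             # transaction started successfully
//   jmp     .Lsink
// .Lfall:
//   s1 = EAX            # abort status modeled by XABORT_DEF
// .Lsink:
//   v = phi(s0, s1)
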
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
  }

  DebugLoc dl = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI.getNumOperands();
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
  if (MI.hasOneMemOperand())
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::XMM0);

  MI.eraseFromParent();
  return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }

  DebugLoc dl = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI.getNumOperands(); // remove the results
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
  if (MI.hasOneMemOperand())
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::ECX);

  MI.eraseFromParent();
  return BB;
}
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // insert input VAL into EAX
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
      .addReg(MI.getOperand(0).getReg());
  // insert zero to ECX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

  // insert zero to EDX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

  // insert WRPKRU instruction
  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
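
// WRPKRU updates the protection-key rights register from EAX and requires
// ECX and EDX to be zero (it raises #GP otherwise), which is why both
// registers are explicitly cleared above before the instruction is emitted.
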
static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // insert zero to ECX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

  // insert RDPKRU instruction
  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::EAX);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget,
                                      unsigned Opc) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
      .addReg(MI.getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
      .addReg(MI.getOperand(ValOps + 1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(Opc));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI->getOperand(i));

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  unsigned Align = MI.getOperand(8).getImm();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI.getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);
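
  // e.g. the rounding above maps ArgSize 1..8 -> 8 and 9..16 -> 16
  // ((12 + 7) & ~7 == 16), keeping every slot in the overflow area 8-byte
  // aligned as the ABI requires.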

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;   // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    // thisMBB
    //   |     .
    //   |        .
    //   offsetMBB   overflowMBB
    //   |        .
    //   |     .
    //   endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
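
// For example, when pulling a double (ArgMode == 2, so UseFPOffset), the
// bound checked above is MaxOffset = 6*8 + 8*16 = 176: fp_offset values
// below that index one of the 8 XMM registers saved in reg_save_area, and
// anything at or beyond it falls through to the overflow (stack) area.
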
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
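
// Per the SysV AMD64 ABI, the caller of a varargs function passes in %al an
// upper bound on the number of vector registers actually used, e.g.
//   movb $1, %al    # one double passed in %xmm0
//   callq printf
// and that count is what the TEST8rr above compares against zero.
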
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern. The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
  // as described above, by inserting a BB, and then making a PHI at the join
  // point to select the true and false operands of the CMOV in the PHI.
  //
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all which are based on
  // the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here, is that in a case like:
  //
  // t2 = CMOV cond1 t1, f1
  // t3 = CMOV cond1 t2, f2
  //
  // when rewriting this into PHIs, we have to perform some renaming on the
  // temps since you cannot have a PHI operand refer to a PHI result earlier
  // in the same block. The "simple" but wrong lowering would be:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t2(BB1), f2(BB2)
  //
  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
  // renaming is to note that on the path through BB1, t2 is really just a
  // copy of t1, and do that renaming, properly generating:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t1(BB1), f2(BB2)
  //
  // Case 2, we lower cascaded CMOVs such as
  //
  //   (CMOV (CMOV F, T, cc1), T, cc2)
  //
  // to two successive branches. For that, we look for another CMOV as the
  // following instruction.
  //
  // Without this, we would add a PHI between the two jumps, which ends up
  // creating a few copies all around. For instance, for
  //
  //    (sitofp (zext (fcmp une)))
  //
  // we would generate:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         movaps  %xmm0, %xmm1
  //         jne     .LBB5_2
  //         xorps   %xmm1, %xmm1
  // .LBB5_2:
  //         jp      .LBB5_4
  //         movaps  %xmm1, %xmm0
  // .LBB5_4:
  //         retq
  //
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //         ucomiss %xmm1, %xmm0
  //         movss  <1.0f>, %xmm0
  //         jne     .LBB5_4
  //         xorps   %xmm0, %xmm0
  // .LBB5_4:
  //         retq
  //

  MachineInstr *CascadedCMOV = nullptr;
  MachineInstr *LastCMOV = &MI;
  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));

  // Check for case 1, where there are multiple CMOVs with the same condition
  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
  // number of jumps the most.

  if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVS with the same condition.
    while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
           (NextMIIt->getOperand(3).getImm() == CC ||
            NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but only do this if we didn't already find
  // case 1, as indicated by LastCMOV == MI.
  if (LastCMOV == &MI && NextMIIt != BB->end() &&
      NextMIIt->getOpcode() == MI.getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
      NextMIIt->getOperand(1).isKill()) {
    CascadedCMOV = &*NextMIIt;
  }

  MachineBasicBlock *jcc1MBB = nullptr;

  // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block. EFLAGS is used by both, so mark it as live in the second.
  if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
  }

  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  if (CascadedCMOV) {
    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
    // jump to the sinkMBB.
    jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(sinkMBB);
  } else {
    BB->addSuccessor(copy0MBB);
  }

  // The true block target of the first (or only) branch is always sinkMBB.
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  MachineInstrBuilder MIB;

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHI's
  // destination registers, and the registers that went into the PHI.

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are generating is the opposite condition from
    // the jump we generated, then we have to swap the operands for the
    // PHI that is going to be generated.
    if (MIIt->getOperand(3).getImm() == OppCC)
      std::swap(Op1Reg, Op2Reg);

    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
      Op1Reg = RegRewriteTable[Op1Reg].first;

    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
      Op2Reg = RegRewriteTable[Op2Reg].second;

    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
                  TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg).addMBB(copy0MBB)
              .addReg(Op2Reg).addMBB(thisMBB);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
  if (CascadedCMOV) {
    MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
            DL, TII->get(TargetOpcode::COPY),
            CascadedCMOV->getOperand(0).getReg())
        .addReg(MI.getOperand(0).getReg());
    CascadedCMOV->eraseFromParent();
  }

  // Now remove the CMOV(s).
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
    (MIIt++)->eraseFromParent();

  return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  // Combine the following atomic floating-point modification pattern:
  //   a.store(reg OP a.load(acquire), release)
  // Transform them into:
  //   OPss (%gpr), %xmm
  //   movss %xmm, (%gpr)
  // Or sd equivalent for 64-bit operations.
  unsigned MOp, FOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
  unsigned ValOpIdx = X86::AddrNumOperands;
  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, DL, TII->get(FOp),
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
          .addReg(VSrc);
  for (int i = 0; i < X86::AddrNumOperands; ++i) {
    MachineOperand &Operand = MI.getOperand(i);
    // Clear any kill flags on register operands as we'll create a second
    // instruction using the same address operands.
    if (Operand.isReg())
      Operand.setIsKill(false);
    MIB.add(Operand);
  }
  MachineInstr *FOpMI = MIB;
  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
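
// e.g. a RELEASE_FADD64mr pseudo becomes
//   addsd (%rdi), %xmm0
//   movsd %xmm0, (%rdi)
// which is sound because under the x86 TSO memory model ordinary loads
// already have acquire semantics and ordinary stores release semantics.
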
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
                                        MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const BasicBlock *LLVM_BB = BB->getBasicBlock();

  assert(MF->shouldSplitStack());

  const bool Is64Bit = Subtarget.is64Bit();
  const bool IsLP64 = Subtarget.isTarget64BitLP64();

  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
  //

  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *AddrRegClass =
      getRegClassFor(getPointerTy(MF->getDataLayout()));

  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
           sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;

  MachineFunction::iterator MBBIter = ++BB->getIterator();

  MF->insert(MBBIter, bumpMBB);
  MF->insert(MBBIter, mallocMBB);
  MF->insert(MBBIter, continueMBB);

  continueMBB->splice(continueMBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add code to the main basic block to check if the stack limit has been hit,
  // and if so, jump to mallocMBB otherwise to bumpMBB.
  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
    .addReg(tmpSPVReg).addReg(sizeVReg);
  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
    .addReg(SPLimitVReg);
  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

  // bumpMBB simply decreases the stack pointer, since we know the current
  // stacklet has enough space.
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
    .addReg(SPLimitVReg);
  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Calls into a routine in libgcc to allocate more space from the heap.
  const uint32_t *RegMask =
      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::RDI, RegState::Implicit)
      .addReg(X86::RAX, RegState::ImplicitDefine);
  } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
      .addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EDI, RegState::Implicit)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(16);
    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol("__morestack_allocate_stack_space")
      .addRegMask(RegMask)
      .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);

  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
    .addReg(IsLP64 ? X86::RAX : X86::EAX);
  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

  // Set up the CFG correctly.
  BB->addSuccessor(bumpMBB);
  BB->addSuccessor(mallocMBB);
  mallocMBB->addSuccessor(continueMBB);
  bumpMBB->addSuccessor(continueMBB);

  // Take care of the PHI nodes.
  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
          MI.getOperand(0).getReg())
    .addReg(mallocPtrVReg)
    .addMBB(mallocMBB)
    .addReg(bumpSPPtrVReg)
    .addMBB(bumpMBB);

  // Delete the original pseudo instruction.
  MI.eraseFromParent();

  // And we're done.
  return continueMBB;
}
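
// Note: the TlsOffset values above (%fs:0x70 for LP64, %fs:0x40 for x32,
// %gs:0x30 for 32-bit) are assumed to be the thread-local slots where the
// split-stack runtime (libgcc's __morestack machinery) keeps the current
// stacklet's limit; the CMP against that slot is what routes allocation to
// bumpMBB or mallocMBB.
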
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
  DebugLoc DL = MI.getDebugLoc();

  assert(!isAsynchronousEHPersonality(
             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
         "SEH does not use catchret!");

  // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;

  // C++ EH creates a new target block to hold the restore code, and wires up
  // the new block to the return destination with a normal JMP_4.
  MachineBasicBlock *RestoreMBB =
      MF->CreateMachineBasicBlock(BB->getBasicBlock());
  assert(BB->succ_size() == 1);
  MF->insert(std::next(BB->getIterator()), RestoreMBB);
  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(RestoreMBB);
  MI.getOperand(0).setMBB(RestoreMBB);

  auto RestoreMBBI = RestoreMBB->begin();
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
  return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
                                       MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const Constant *PerFn = MF->getFunction()->getPersonalityFn();
  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
  // Only 32-bit SEH requires special handling for catchpad.
  if (IsSEH && Subtarget.is32Bit()) {
    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }
  MI.eraseFromParent();
  return BB;
}
25832 MachineBasicBlock *
25833 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25834 MachineBasicBlock *BB) const {
25835 // So, here we replace TLSADDR with the sequence:
25836 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25837 // We need this because TLSADDR is lowered into calls
25838 // inside MC, therefore without the two markers shrink-wrapping
25839 // may push the prologue/epilogue pass them.
25840 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25841 DebugLoc DL = MI.getDebugLoc();
25842 MachineFunction &MF = *BB->getParent();
25844 // Emit CALLSEQ_START right before the instruction.
25845 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25846 MachineInstrBuilder CallseqStart =
25847 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
25848 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25850 // Emit CALLSEQ_END right after the instruction.
25851 // We don't call erase from parent because we want to keep the
25852 // original instruction around.
25853 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25854 MachineInstrBuilder CallseqEnd =
25855 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25856 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
25858 return BB;
25859 }
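// The net effect is the bracketed sequence (sketch):
//
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLSADDR ...            ; becomes the real call during MC lowering
//   ADJCALLSTACKUP 0, 0
//
// so shrink-wrapping sees proper call-frame markers around the hidden call.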
25861 MachineBasicBlock *
25862 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25863 MachineBasicBlock *BB) const {
25864 // This is pretty easy. We're taking the value that we received from
25865 // our load from the relocation, sticking it in either RDI (x86-64)
25866 // or EAX and doing an indirect call. The return value will then
25867 // be in the normal return register.
25868 MachineFunction *F = BB->getParent();
25869 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25870 DebugLoc DL = MI.getDebugLoc();
25872 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25873 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25875 // Get a register mask for the lowered call.
25876 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25877 // proper register mask.
25878 const uint32_t *RegMask =
25879 Subtarget.is64Bit() ?
25880 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25881 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25882 if (Subtarget.is64Bit()) {
25883 MachineInstrBuilder MIB =
25884 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25885 .addReg(X86::RIP)
25886 .addImm(1)
25887 .addReg(0)
25888 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25889 MI.getOperand(3).getTargetFlags())
25890 .addReg(0);
25891 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25892 addDirectMem(MIB, X86::RDI);
25893 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25894 } else if (!isPositionIndependent()) {
25895 MachineInstrBuilder MIB =
25896 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25897 .addReg(0)
25898 .addImm(1)
25899 .addReg(0)
25900 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25901 MI.getOperand(3).getTargetFlags())
25902 .addReg(0);
25903 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25904 addDirectMem(MIB, X86::EAX);
25905 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25906 } else {
25907 MachineInstrBuilder MIB =
25908 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25909 .addReg(TII->getGlobalBaseReg(F))
25910 .addImm(1)
25911 .addReg(0)
25912 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25913 MI.getOperand(3).getTargetFlags())
25914 .addReg(0);
25915 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25916 addDirectMem(MIB, X86::EAX);
25917 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25918 }
25920 MI.eraseFromParent(); // The pseudo instruction is gone now.
25921 return BB;
25922 }
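// For reference, on x86-64 Darwin the code built above corresponds to the
// standard TLV access sequence (illustrative, for a TLS global 'x'):
//
//   movq _x@TLVP(%rip), %rdi
//   callq *(%rdi)            ; the result pointer is returned in RAX
//
// The 32-bit paths differ only in the base register and PIC setup.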
25924 MachineBasicBlock *
25925 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25926 MachineBasicBlock *MBB) const {
25927 DebugLoc DL = MI.getDebugLoc();
25928 MachineFunction *MF = MBB->getParent();
25929 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25930 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25931 MachineRegisterInfo &MRI = MF->getRegInfo();
25933 const BasicBlock *BB = MBB->getBasicBlock();
25934 MachineFunction::iterator I = ++MBB->getIterator();
25936 // Memory Reference
25937 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25938 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25941 unsigned MemOpndSlot = 0;
25943 unsigned CurOp = 0;
25945 unsigned DstReg = MI.getOperand(CurOp++).getReg();
25946 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25947 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
25949 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25950 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25952 MemOpndSlot = CurOp;
25954 MVT PVT = getPointerTy(MF->getDataLayout());
25955 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25956 "Invalid Pointer Size!");
25958 // For v = setjmp(buf), we generate
25959 //
25960 // thisMBB:
25961 //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25962 //  SjLjSetup restoreMBB
25963 //
25964 // mainMBB:
25965 //  v_main = 0
25966 //
25967 // sinkMBB:
25968 //  v = phi(main, restore)
25969 //
25970 // restoreMBB:
25971 //  if base pointer being used, load it from frame
25972 //  v_restore = 1
25974 MachineBasicBlock *thisMBB = MBB;
25975 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25976 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25977 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25978 MF->insert(I, mainMBB);
25979 MF->insert(I, sinkMBB);
25980 MF->push_back(restoreMBB);
25981 restoreMBB->setHasAddressTaken();
25983 MachineInstrBuilder MIB;
25985 // Transfer the remainder of BB and its successor edges to sinkMBB.
25986 sinkMBB->splice(sinkMBB->begin(), MBB,
25987 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25988 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25991 unsigned PtrStoreOpc = 0;
25992 unsigned LabelReg = 0;
25993 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25994 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25995 !isPositionIndependent();
25997 // Prepare IP either in reg or imm.
25998 if (!UseImmLabel) {
25999 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26000 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26001 LabelReg = MRI.createVirtualRegister(PtrRC);
26002 if (Subtarget.is64Bit()) {
26003 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26004 .addReg(X86::RIP)
26005 .addImm(1)
26006 .addReg(0)
26007 .addMBB(restoreMBB)
26008 .addReg(0);
26009 } else {
26010 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26011 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26012 .addReg(XII->getGlobalBaseReg(MF))
26013 .addImm(1)
26014 .addReg(0)
26015 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26016 .addReg(0);
26017 }
26018 } else
26019 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26021 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26022 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26023 if (i == X86::AddrDisp)
26024 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26025 else
26026 MIB.add(MI.getOperand(MemOpndSlot + i));
26027 }
26028 if (!UseImmLabel)
26029 MIB.addReg(LabelReg);
26030 else
26031 MIB.addMBB(restoreMBB);
26032 MIB.setMemRefs(MMOBegin, MMOEnd);
26034 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26035 .addMBB(restoreMBB);
26037 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26038 MIB.addRegMask(RegInfo->getNoPreservedMask());
26039 thisMBB->addSuccessor(mainMBB);
26040 thisMBB->addSuccessor(restoreMBB);
26044 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26045 mainMBB->addSuccessor(sinkMBB);
26048 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26049 TII->get(X86::PHI), DstReg)
26050 .addReg(mainDstReg).addMBB(mainMBB)
26051 .addReg(restoreDstReg).addMBB(restoreMBB);
26054 if (RegInfo->hasBasePointer(*MF)) {
26055 const bool Uses64BitFramePtr =
26056 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26057 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26058 X86FI->setRestoreBasePointer(MF);
26059 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26060 unsigned BasePtr = RegInfo->getBaseRegister();
26061 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26062 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26063 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26064 .setMIFlag(MachineInstr::FrameSetup);
26065 }
26066 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26067 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26068 restoreMBB->addSuccessor(sinkMBB);
26070 MI.eraseFromParent();
26071 return sinkMBB;
26072 }
26074 MachineBasicBlock *
26075 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26076 MachineBasicBlock *MBB) const {
26077 DebugLoc DL = MI.getDebugLoc();
26078 MachineFunction *MF = MBB->getParent();
26079 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26080 MachineRegisterInfo &MRI = MF->getRegInfo();
26082 // Memory Reference
26083 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26084 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26086 MVT PVT = getPointerTy(MF->getDataLayout());
26087 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26088 "Invalid Pointer Size!");
26090 const TargetRegisterClass *RC =
26091 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26092 unsigned Tmp = MRI.createVirtualRegister(RC);
26093 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26094 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26095 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26096 unsigned SP = RegInfo->getStackRegister();
26098 MachineInstrBuilder MIB;
26100 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26101 const int64_t SPOffset = 2 * PVT.getStoreSize();
26103 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26104 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26107 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26108 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26109 MIB.add(MI.getOperand(i));
26110 MIB.setMemRefs(MMOBegin, MMOEnd);
26112 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26113 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26114 if (i == X86::AddrDisp)
26115 MIB.addDisp(MI.getOperand(i), LabelOffset);
26116 else
26117 MIB.add(MI.getOperand(i));
26118 }
26119 MIB.setMemRefs(MMOBegin, MMOEnd);
26121 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26122 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26123 if (i == X86::AddrDisp)
26124 MIB.addDisp(MI.getOperand(i), SPOffset);
26125 else
26126 MIB.add(MI.getOperand(i));
26127 }
26128 MIB.setMemRefs(MMOBegin, MMOEnd);
26130 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26132 MI.eraseFromParent();
26133 return MBB;
26134 }
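// In pseudo-asm, the emitted longjmp for PVT == i64 is (sketch; Tmp is the
// virtual register created above):
//
//   movq (buf), %rbp        ; reload FP
//   movq 8(buf), Tmp        ; reload the target IP  (LabelOffset)
//   movq 16(buf), %rsp      ; reload SP             (SPOffset)
//   jmpq *Tmp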
26136 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26137 MachineBasicBlock *MBB,
26138 MachineBasicBlock *DispatchBB,
26139 int FI) const {
26140 DebugLoc DL = MI.getDebugLoc();
26141 MachineFunction *MF = MBB->getParent();
26142 MachineRegisterInfo *MRI = &MF->getRegInfo();
26143 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26145 MVT PVT = getPointerTy(MF->getDataLayout());
26146 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26148 unsigned Op = 0;
26149 unsigned VR = 0;
26151 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26152 !isPositionIndependent();
26154 if (UseImmLabel) {
26155 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26156 } else {
26157 const TargetRegisterClass *TRC =
26158 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26159 VR = MRI->createVirtualRegister(TRC);
26160 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26162 if (Subtarget.is64Bit())
26163 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26164 .addReg(X86::RIP)
26165 .addImm(1)
26166 .addReg(0)
26167 .addMBB(DispatchBB)
26168 .addReg(0);
26169 else
26170 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26171 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26172 .addImm(1)
26173 .addReg(0)
26174 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26175 .addReg(0);
26176 }
26178 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26179 addFrameReference(MIB, FI, 36);
26180 if (UseImmLabel)
26181 MIB.addMBB(DispatchBB);
26182 else
26183 MIB.addReg(VR);
26184 }
26186 MachineBasicBlock *
26187 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26188 MachineBasicBlock *BB) const {
26189 DebugLoc DL = MI.getDebugLoc();
26190 MachineFunction *MF = BB->getParent();
26191 MachineFrameInfo &MFI = MF->getFrameInfo();
26192 MachineRegisterInfo *MRI = &MF->getRegInfo();
26193 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26194 int FI = MFI.getFunctionContextIndex();
26196 // Get a mapping of the call site numbers to all of the landing pads they're
26197 // associated with.
26198 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26199 unsigned MaxCSNum = 0;
26200 for (auto &MBB : *MF) {
26201 if (!MBB.isEHPad())
26202 continue;
26204 MCSymbol *Sym = nullptr;
26205 for (const auto &MI : MBB) {
26206 if (MI.isDebugValue())
26207 continue;
26209 assert(MI.isEHLabel() && "expected EH_LABEL");
26210 Sym = MI.getOperand(0).getMCSymbol();
26211 break;
26212 }
26214 if (!MF->hasCallSiteLandingPad(Sym))
26215 continue;
26217 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26218 CallSiteNumToLPad[CSI].push_back(&MBB);
26219 MaxCSNum = std::max(MaxCSNum, CSI);
26220 }
26221 }
26223 // Get an ordered list of the machine basic blocks for the jump table.
26224 std::vector<MachineBasicBlock *> LPadList;
26225 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26226 LPadList.reserve(CallSiteNumToLPad.size());
26228 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26229 for (auto &LP : CallSiteNumToLPad[CSI]) {
26230 LPadList.push_back(LP);
26231 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26232 }
26233 }
26235 assert(!LPadList.empty() &&
26236 "No landing pad destinations for the dispatch jump table!");
26238 // Create the MBBs for the dispatch code.
26240 // Shove the dispatch's address into the return slot in the function context.
26241 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26242 DispatchBB->setIsEHPad(true);
26244 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26245 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26246 DispatchBB->addSuccessor(TrapBB);
26248 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26249 DispatchBB->addSuccessor(DispContBB);
26252 MF->push_back(DispatchBB);
26253 MF->push_back(DispContBB);
26254 MF->push_back(TrapBB);
26256 // Insert code into the entry block that creates and registers the function
26257 // context.
26258 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26260 // Create the jump table and associated information
26261 MachineJumpTableInfo *JTI =
26262 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26263 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26265 const X86RegisterInfo &RI = TII->getRegisterInfo();
26266 // Add a register mask with no preserved registers. This results in all
26267 // registers being marked as clobbered.
26268 if (RI.hasBasePointer(*MF)) {
26269 const bool FPIs64Bit =
26270 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26271 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26272 MFI->setRestoreBasePointer(MF);
26274 unsigned FP = RI.getFrameRegister(*MF);
26275 unsigned BP = RI.getBaseRegister();
26276 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26277 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26278 MFI->getRestoreBasePointerOffset())
26279 .addRegMask(RI.getNoPreservedMask());
26280 } else {
26281 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26282 .addRegMask(RI.getNoPreservedMask());
26283 }
26285 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26286 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26287 4);
26288 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26289 .addReg(IReg)
26290 .addImm(LPadList.size());
26291 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26293 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26294 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26295 .addReg(IReg)
26296 .addImm(1);
26297 BuildMI(DispContBB, DL,
26298 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26299 .addReg(0)
26300 .addImm(Subtarget.is64Bit() ? 8 : 4)
26301 .addReg(JReg)
26302 .addJumpTableIndex(MJTI)
26303 .addReg(0);
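// Taken together, the dispatch blocks implement (sketch; operands as built
// above, with scale 4 instead of 8 on 32-bit targets):
//
//   DispatchBB:  mov  4(FI), IReg        ; load the call site index
//                cmp  $NumLPads, IReg
//                ja   TrapBB             ; out of range -> trap
//   DispContBB:  sub  $1, IReg -> JReg
//                jmp  *JumpTable(, JReg, 8)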
26305 // Add the jump table entries as successors to the MBB.
26306 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26307 for (auto &LP : LPadList)
26308 if (SeenMBBs.insert(LP).second)
26309 DispContBB->addSuccessor(LP);
26311 // N.B. the order the invoke BBs are processed in doesn't matter here.
26312 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26313 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26314 for (MachineBasicBlock *MBB : InvokeBBs) {
26315 // Remove the landing pad successor from the invoke block and replace it
26316 // with the new dispatch block.
26317 // Keep a copy of Successors since it's modified inside the loop.
26318 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26319 MBB->succ_rend());
26320 // FIXME: Avoid quadratic complexity.
26321 for (auto MBBS : Successors) {
26322 if (MBBS->isEHPad()) {
26323 MBB->removeSuccessor(MBBS);
26324 MBBLPads.push_back(MBBS);
26325 }
26326 }
26328 MBB->addSuccessor(DispatchBB);
26330 // Find the invoke call and mark all of the callee-saved registers as
26331 // 'implicit defined' so that they're spilled. This prevents code from
26332 // moving instructions to before the EH block, where they will never be
26333 // executed.
26334 for (auto &II : reverse(*MBB)) {
26335 if (!II.isCall())
26336 continue;
26338 DenseMap<unsigned, bool> DefRegs;
26339 for (auto &MOp : II.operands())
26340 if (MOp.isReg())
26341 DefRegs[MOp.getReg()] = true;
26343 MachineInstrBuilder MIB(*MF, &II);
26344 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26345 unsigned Reg = SavedRegs[RI];
26346 if (!DefRegs[Reg])
26347 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26348 }
26350 break;
26351 }
26352 }
26354 // Mark all former landing pads as non-landing pads. The dispatch is the only
26355 // landing pad now.
26356 for (auto &LP : MBBLPads)
26357 LP->setIsEHPad(false);
26359 // The instruction is gone now.
26360 MI.eraseFromParent();
26361 return BB;
26362 }
26364 MachineBasicBlock *
26365 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26366 MachineBasicBlock *BB) const {
26367 MachineFunction *MF = BB->getParent();
26368 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26369 DebugLoc DL = MI.getDebugLoc();
26371 switch (MI.getOpcode()) {
26372 default: llvm_unreachable("Unexpected instr type to insert");
26373 case X86::TAILJMPd64:
26374 case X86::TAILJMPr64:
26375 case X86::TAILJMPm64:
26376 case X86::TAILJMPr64_REX:
26377 case X86::TAILJMPm64_REX:
26378 llvm_unreachable("TAILJMP64 would not be touched here.");
26379 case X86::TCRETURNdi64:
26380 case X86::TCRETURNri64:
26381 case X86::TCRETURNmi64:
26382 return BB;
26383 case X86::TLS_addr32:
26384 case X86::TLS_addr64:
26385 case X86::TLS_base_addr32:
26386 case X86::TLS_base_addr64:
26387 return EmitLoweredTLSAddr(MI, BB);
26388 case X86::CATCHRET:
26389 return EmitLoweredCatchRet(MI, BB);
26390 case X86::CATCHPAD:
26391 return EmitLoweredCatchPad(MI, BB);
26392 case X86::SEG_ALLOCA_32:
26393 case X86::SEG_ALLOCA_64:
26394 return EmitLoweredSegAlloca(MI, BB);
26395 case X86::TLSCall_32:
26396 case X86::TLSCall_64:
26397 return EmitLoweredTLSCall(MI, BB);
26398 case X86::CMOV_FR32:
26399 case X86::CMOV_FR64:
26400 case X86::CMOV_FR128:
26401 case X86::CMOV_GR8:
26402 case X86::CMOV_GR16:
26403 case X86::CMOV_GR32:
26404 case X86::CMOV_RFP32:
26405 case X86::CMOV_RFP64:
26406 case X86::CMOV_RFP80:
26407 case X86::CMOV_V2F64:
26408 case X86::CMOV_V2I64:
26409 case X86::CMOV_V4F32:
26410 case X86::CMOV_V4F64:
26411 case X86::CMOV_V4I64:
26412 case X86::CMOV_V16F32:
26413 case X86::CMOV_V8F32:
26414 case X86::CMOV_V8F64:
26415 case X86::CMOV_V8I64:
26416 case X86::CMOV_V8I1:
26417 case X86::CMOV_V16I1:
26418 case X86::CMOV_V32I1:
26419 case X86::CMOV_V64I1:
26420 return EmitLoweredSelect(MI, BB);
26422 case X86::RDFLAGS32:
26423 case X86::RDFLAGS64: {
26424 unsigned PushF =
26425 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26426 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26427 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26428 // Permit reads of the FLAGS register without it being defined.
26429 // This intrinsic exists to read external processor state in flags, such as
26430 // the trap flag, interrupt flag, and direction flag, none of which are
26431 // modeled by the backend.
26432 Push->getOperand(2).setIsUndef();
26433 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26435 MI.eraseFromParent(); // The pseudo is gone now.
26436 return BB;
26437 }
26439 case X86::WRFLAGS32:
26440 case X86::WRFLAGS64: {
26441 unsigned Push =
26442 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26443 unsigned PopF =
26444 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26445 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26446 BuildMI(*BB, MI, DL, TII->get(PopF));
26448 MI.eraseFromParent(); // The pseudo is gone now.
26449 return BB;
26450 }
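// In other words, the pseudos expand to the classic EFLAGS save/restore
// idiom (sketch):
//
//   RDFLAGS:  pushf ; pop  %reg     ; read EFLAGS into a GPR
//   WRFLAGS:  push %reg ; popf      ; write a GPR back into EFLAGS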
26452 case X86::RELEASE_FADD32mr:
26453 case X86::RELEASE_FADD64mr:
26454 return EmitLoweredAtomicFP(MI, BB);
26456 case X86::FP32_TO_INT16_IN_MEM:
26457 case X86::FP32_TO_INT32_IN_MEM:
26458 case X86::FP32_TO_INT64_IN_MEM:
26459 case X86::FP64_TO_INT16_IN_MEM:
26460 case X86::FP64_TO_INT32_IN_MEM:
26461 case X86::FP64_TO_INT64_IN_MEM:
26462 case X86::FP80_TO_INT16_IN_MEM:
26463 case X86::FP80_TO_INT32_IN_MEM:
26464 case X86::FP80_TO_INT64_IN_MEM: {
26465 // Change the floating point control register to use "round towards zero"
26466 // mode when truncating to an integer value.
26467 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26468 addFrameReference(BuildMI(*BB, MI, DL,
26469 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26471 // Load the old value of the control word...
26472 unsigned OldCW =
26473 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26474 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26475 CWFrameIdx);
26477 // Set the high part to be round to zero...
26478 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26479 .addImm(0xC7F);
26481 // Reload the modified control word now...
26482 addFrameReference(BuildMI(*BB, MI, DL,
26483 TII->get(X86::FLDCW16m)), CWFrameIdx);
26485 // Restore the memory image of the control word to its original value.
26486 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26487 .addReg(OldCW);
26489 // Get the X86 opcode to use.
26490 unsigned Opc;
26491 switch (MI.getOpcode()) {
26492 default: llvm_unreachable("illegal opcode!");
26493 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26494 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26495 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26496 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26497 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26498 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26499 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26500 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26501 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26502 }
26504 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26505 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26506 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26508 // Reload the original control word now.
26509 addFrameReference(BuildMI(*BB, MI, DL,
26510 TII->get(X86::FLDCW16m)), CWFrameIdx);
26512 MI.eraseFromParent(); // The pseudo instruction is gone now.
26513 return BB;
26514 }
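// The rounding-mode dance above amounts to (sketch):
//
//   fnstcw (slot)          ; save the current FPU control word
//   movw   $0xC7F, (slot)  ; a control word selecting round-toward-zero
//   fldcw  (slot)          ; activate it
//   fistp  dest            ; the truncating store
//   fldcw  (slot)          ; restore the original control word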
26515 // String/text processing lowering.
26516 case X86::PCMPISTRM128REG:
26517 case X86::VPCMPISTRM128REG:
26518 case X86::PCMPISTRM128MEM:
26519 case X86::VPCMPISTRM128MEM:
26520 case X86::PCMPESTRM128REG:
26521 case X86::VPCMPESTRM128REG:
26522 case X86::PCMPESTRM128MEM:
26523 case X86::VPCMPESTRM128MEM:
26524 assert(Subtarget.hasSSE42() &&
26525 "Target must have SSE4.2 or AVX features enabled");
26526 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26528 // String/text processing lowering.
26529 case X86::PCMPISTRIREG:
26530 case X86::VPCMPISTRIREG:
26531 case X86::PCMPISTRIMEM:
26532 case X86::VPCMPISTRIMEM:
26533 case X86::PCMPESTRIREG:
26534 case X86::VPCMPESTRIREG:
26535 case X86::PCMPESTRIMEM:
26536 case X86::VPCMPESTRIMEM:
26537 assert(Subtarget.hasSSE42() &&
26538 "Target must have SSE4.2 or AVX features enabled");
26539 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26541 // Thread synchronization.
26542 case X86::MONITOR:
26543 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26544 case X86::MONITORX:
26545 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26548 case X86::CLZERO:
26549 return emitClzero(&MI, BB, Subtarget);
26552 case X86::WRPKRU:
26553 return emitWRPKRU(MI, BB, Subtarget);
26554 case X86::RDPKRU:
26555 return emitRDPKRU(MI, BB, Subtarget);
26557 case X86::XBEGIN:
26558 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26560 case X86::VASTART_SAVE_XMM_REGS:
26561 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26563 case X86::VAARG_64:
26564 return EmitVAARG64WithCustomInserter(MI, BB);
26566 case X86::EH_SjLj_SetJmp32:
26567 case X86::EH_SjLj_SetJmp64:
26568 return emitEHSjLjSetJmp(MI, BB);
26570 case X86::EH_SjLj_LongJmp32:
26571 case X86::EH_SjLj_LongJmp64:
26572 return emitEHSjLjLongJmp(MI, BB);
26574 case X86::Int_eh_sjlj_setup_dispatch:
26575 return EmitSjLjDispatchBlock(MI, BB);
26577 case TargetOpcode::STATEPOINT:
26578 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26579 // this point in the process. We diverge later.
26580 return emitPatchPoint(MI, BB);
26582 case TargetOpcode::STACKMAP:
26583 case TargetOpcode::PATCHPOINT:
26584 return emitPatchPoint(MI, BB);
26586 case TargetOpcode::PATCHABLE_EVENT_CALL:
26587 // Do nothing here, handle in xray instrumentation pass.
26588 return BB;
26590 case X86::LCMPXCHG8B: {
26591 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26592 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
26593 // requires a memory operand. If the current architecture is i686 and the
26594 // current function needs a base pointer
26595 // - which is ESI for i686 - the register allocator would not be able to
26596 // allocate registers for an address of the form X(%reg, %reg, Y):
26597 // there would never be enough unreserved registers during regalloc
26598 // (without the need for a base pointer the only option would be
26599 // X(%edi, %esi, Y)). We give the register allocator a hand by
26600 // precomputing the address in a new vreg using LEA.
26602 // If it is not i686, or there is no base pointer, there is nothing to do.
26603 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26604 return BB;
26606 // Even though this code does not necessarily need the base pointer to
26607 // be ESI, we check for that. The reason: if this assert fails, some
26608 // changes have happened in the compiler's base pointer handling, and
26609 // they most probably have to be addressed here as well.
26610 assert(TRI->getBaseRegister() == X86::ESI &&
26611 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26612 "base pointer in mind");
26614 MachineRegisterInfo &MRI = MF->getRegInfo();
26615 MVT SPTy = getPointerTy(MF->getDataLayout());
26616 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26617 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26619 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26620 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26621 // does not use an index register.
26622 if (AM.IndexReg == X86::NoRegister)
26623 return BB;
26625 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26626 // four operand definitions that are E[ABCD] registers. We skip them and
26627 // then insert the LEA.
26628 MachineBasicBlock::iterator MBBI(MI);
26629 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26630 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26631 --MBBI;
26632 addFullAddress(
26633 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26635 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26637 return BB;
26638 }
26639 case X86::LCMPXCHG16B:
26640 return BB;
26641 case X86::LCMPXCHG8B_SAVE_EBX:
26642 case X86::LCMPXCHG16B_SAVE_RBX: {
26643 unsigned BasePtr =
26644 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26645 if (!BB->isLiveIn(BasePtr))
26646 BB->addLiveIn(BasePtr);
26647 return BB;
26648 }
26649 }
26650 }
26652 //===----------------------------------------------------------------------===//
26653 // X86 Optimization Hooks
26654 //===----------------------------------------------------------------------===//
26656 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26657 KnownBits &Known,
26658 const APInt &DemandedElts,
26659 const SelectionDAG &DAG,
26660 unsigned Depth) const {
26661 unsigned BitWidth = Known.getBitWidth();
26662 unsigned Opc = Op.getOpcode();
26663 EVT VT = Op.getValueType();
26664 assert((Opc >= ISD::BUILTIN_OP_END ||
26665 Opc == ISD::INTRINSIC_WO_CHAIN ||
26666 Opc == ISD::INTRINSIC_W_CHAIN ||
26667 Opc == ISD::INTRINSIC_VOID) &&
26668 "Should use MaskedValueIsZero if you don't know whether Op"
26669 " is a target node!");
26685 // These nodes' second result is a boolean.
26686 if (Op.getResNo() == 0)
26687 break;
26688 LLVM_FALLTHROUGH;
26689 case X86ISD::SETCC:
26690 Known.Zero.setBitsFrom(1);
26691 break;
26692 case X86ISD::MOVMSK: {
26693 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26694 Known.Zero.setBitsFrom(NumLoBits);
26695 break;
26696 }
26697 case X86ISD::VSHLI:
26698 case X86ISD::VSRLI: {
26699 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26700 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26701 Known.setAllZero();
26702 break;
26703 }
26705 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26706 unsigned ShAmt = ShiftImm->getZExtValue();
26707 if (Opc == X86ISD::VSHLI) {
26708 Known.Zero <<= ShAmt;
26709 Known.One <<= ShAmt;
26710 // Low bits are known zero.
26711 Known.Zero.setLowBits(ShAmt);
26712 } else {
26713 Known.Zero.lshrInPlace(ShAmt);
26714 Known.One.lshrInPlace(ShAmt);
26715 // High bits are known zero.
26716 Known.Zero.setHighBits(ShAmt);
26717 }
26718 }
26719 break;
26720 }
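// Worked example for the VSHLI path above: shifting an i32 element left by
// 8 with Known.Zero = 0xFFFF0000 gives Known.Zero = (0xFFFF0000 << 8) |
// 0x000000FF = 0xFF0000FF - the shifted-in low bits are known zero, and
// bits 8-23 (the old bits 0-15) remain unknown.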
26721 case X86ISD::VZEXT: {
26722 SDValue N0 = Op.getOperand(0);
26723 unsigned NumElts = VT.getVectorNumElements();
26725 EVT SrcVT = N0.getValueType();
26726 unsigned InNumElts = SrcVT.getVectorNumElements();
26727 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
26728 assert(InNumElts >= NumElts && "Illegal VZEXT input");
26730 Known = KnownBits(InBitWidth);
26731 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
26732 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
26733 Known = Known.zext(BitWidth);
26734 Known.Zero.setBitsFrom(InBitWidth);
26735 break;
26736 }
26737 }
26738 }
26740 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26741 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
26742 unsigned Depth) const {
26743 unsigned VTBits = Op.getScalarValueSizeInBits();
26744 unsigned Opcode = Op.getOpcode();
26745 switch (Opcode) {
26746 case X86ISD::SETCC_CARRY:
26747 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
26748 return VTBits;
26750 case X86ISD::VSEXT: {
26751 SDValue Src = Op.getOperand(0);
26752 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26753 Tmp += VTBits - Src.getScalarValueSizeInBits();
26754 return Tmp;
26755 }
26757 case X86ISD::VSRAI: {
26758 SDValue Src = Op.getOperand(0);
26759 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26760 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
26761 ShiftVal += Tmp;
26762 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
26763 }
26765 case X86ISD::PCMPGT:
26766 case X86ISD::PCMPEQ:
26767 case X86ISD::CMPP:
26768 case X86ISD::VPCOM:
26769 case X86ISD::VPCOMU:
26770 // Vector compares return zero/all-bits result values.
26771 return VTBits;
26772 }
26774 // Fallback case.
26775 return 1;
26776 }
26778 /// Returns true (and the GlobalValue and the offset) if the node is a
26779 /// GlobalAddress + offset.
26780 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26781 const GlobalValue* &GA,
26782 int64_t &Offset) const {
26783 if (N->getOpcode() == X86ISD::Wrapper) {
26784 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26785 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
26786 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
26787 return true;
26788 }
26789 }
26790 return TargetLowering::isGAPlusOffset(N, GA, Offset);
26793 // Attempt to match a combined shuffle mask against supported unary shuffle
26794 // instructions.
26795 // TODO: Investigate sharing more of this with shuffle lowering.
26796 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26797 bool AllowFloatDomain, bool AllowIntDomain,
26798 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
26799 const X86Subtarget &Subtarget,
26800 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26801 unsigned NumMaskElts = Mask.size();
26802 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26804 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
26805 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
26806 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
26807 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
26808 unsigned MaxScale = 64 / MaskEltSize;
26809 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
26810 bool Match = true;
26811 unsigned NumDstElts = NumMaskElts / Scale;
26812 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26813 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
26814 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
26815 }
26816 if (Match) {
26817 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
26818 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
26819 if (SrcVT != MaskVT)
26820 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
26821 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26822 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26823 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
26824 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
26825 return true;
26826 }
26827 }
26828 }
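// Worked example for the extension match above: with 32-bit mask elements,
// {0, Z, 1, Z} (Z = zeroable) matches Scale == 2 - a zero-extension of the
// two low i32 lanes to i64 - and becomes ZERO_EXTEND_VECTOR_INREG (or VZEXT
// when the source first had to be extracted from a wider vector).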
26830 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26831 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26832 isUndefOrEqual(Mask[0], 0) &&
26833 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26834 Shuffle = X86ISD::VZEXT_MOVL;
26835 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
26836 return true;
26837 }
26839 // Check if we have SSE3 which will let us use MOVDDUP etc. These
26840 // instructions are no slower than UNPCKLPD but have the option to
26841 // fold the input operand into even an unaligned memory load.
26842 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
26843 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26844 Shuffle = X86ISD::MOVDDUP;
26845 SrcVT = DstVT = MVT::v2f64;
26846 return true;
26847 }
26848 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26849 Shuffle = X86ISD::MOVSLDUP;
26850 SrcVT = DstVT = MVT::v4f32;
26851 return true;
26852 }
26853 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26854 Shuffle = X86ISD::MOVSHDUP;
26855 SrcVT = DstVT = MVT::v4f32;
26856 return true;
26857 }
26858 }
26860 if (MaskVT.is256BitVector() && AllowFloatDomain) {
26861 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26862 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26863 Shuffle = X86ISD::MOVDDUP;
26864 SrcVT = DstVT = MVT::v4f64;
26865 return true;
26866 }
26867 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26868 Shuffle = X86ISD::MOVSLDUP;
26869 SrcVT = DstVT = MVT::v8f32;
26870 return true;
26871 }
26872 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26873 Shuffle = X86ISD::MOVSHDUP;
26874 SrcVT = DstVT = MVT::v8f32;
26875 return true;
26876 }
26877 }
26879 if (MaskVT.is512BitVector() && AllowFloatDomain) {
26880 assert(Subtarget.hasAVX512() &&
26881 "AVX512 required for 512-bit vector shuffles");
26882 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26883 Shuffle = X86ISD::MOVDDUP;
26884 SrcVT = DstVT = MVT::v8f64;
26885 return true;
26886 }
26887 if (isTargetShuffleEquivalent(
26888 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26889 Shuffle = X86ISD::MOVSLDUP;
26890 SrcVT = DstVT = MVT::v16f32;
26891 return true;
26892 }
26893 if (isTargetShuffleEquivalent(
26894 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26895 Shuffle = X86ISD::MOVSHDUP;
26896 SrcVT = DstVT = MVT::v16f32;
26897 return true;
26898 }
26899 }
26901 // Attempt to match against broadcast-from-vector.
26902 if (Subtarget.hasAVX2()) {
26903 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26904 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26905 SrcVT = DstVT = MaskVT;
26906 Shuffle = X86ISD::VBROADCAST;
26907 return true;
26908 }
26909 }
26911 return false;
26912 }
26914 // Attempt to match a combined shuffle mask against supported unary immediate
26915 // permute instructions.
26916 // TODO: Investigate sharing more of this with shuffle lowering.
26917 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26918 bool AllowFloatDomain,
26919 bool AllowIntDomain,
26920 const X86Subtarget &Subtarget,
26921 unsigned &Shuffle, MVT &ShuffleVT,
26922 unsigned &PermuteImm) {
26923 unsigned NumMaskElts = Mask.size();
26925 bool ContainsZeros = false;
26926 APInt Zeroable(NumMaskElts, false);
26927 for (unsigned i = 0; i != NumMaskElts; ++i) {
26928 int M = Mask[i];
26929 if (isUndefOrZero(M))
26930 Zeroable.setBit(i);
26931 ContainsZeros |= (M == SM_SentinelZero);
26934 // Attempt to match against byte/bit shifts.
26935 // FIXME: Add 512-bit support.
26936 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26937 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26938 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26939 MaskVT.getScalarSizeInBits(), Mask,
26940 0, Zeroable, Subtarget);
26941 if (0 < ShiftAmt) {
26942 PermuteImm = (unsigned)ShiftAmt;
26943 return true;
26944 }
26945 }
26947 // Ensure we don't contain any zero elements.
26948 if (ContainsZeros)
26949 return false;
26951 assert(llvm::all_of(Mask, [&](int M) {
26952 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26953 }) && "Expected unary shuffle");
26955 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26956 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26957 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26959 // Handle PSHUFLW/PSHUFHW repeated patterns.
26960 if (MaskScalarSizeInBits == 16) {
26961 SmallVector<int, 4> RepeatedMask;
26962 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26963 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26964 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26966 // PSHUFLW: permute lower 4 elements only.
26967 if (isUndefOrInRange(LoMask, 0, 4) &&
26968 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26969 Shuffle = X86ISD::PSHUFLW;
26970 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26971 PermuteImm = getV4X86ShuffleImm(LoMask);
26972 return true;
26973 }
26975 // PSHUFHW: permute upper 4 elements only.
26976 if (isUndefOrInRange(HiMask, 4, 8) &&
26977 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26978 // Offset the HiMask so that we can create the shuffle immediate.
26979 int OffsetHiMask[4];
26980 for (int i = 0; i != 4; ++i)
26981 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26983 Shuffle = X86ISD::PSHUFHW;
26984 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26985 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26986 return true;
26987 }
26988 }
26989 }
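// Worked example: the repeated mask {0,1,2,3,7,6,5,4} keeps the low half
// sequential, so it matches PSHUFHW; OffsetHiMask is {3,2,1,0}, which packs
// into PermuteImm = 3 | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.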
26994 // We only support permutation of 32/64 bit elements after this.
26995 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26996 return false;
26998 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
26999 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27000 if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
27001 return false;
27003 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
27004 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
27005 AllowFloatDomain = true;
27006 AllowIntDomain = false;
27007 }
27009 // Check for lane crossing permutes.
27010 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27011 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27012 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
27013 Shuffle = X86ISD::VPERMI;
27014 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27015 PermuteImm = getV4X86ShuffleImm(Mask);
27016 return true;
27017 }
27018 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
27019 SmallVector<int, 4> RepeatedMask;
27020 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27021 Shuffle = X86ISD::VPERMI;
27022 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27023 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27024 return true;
27025 }
27026 }
27027 return false;
27028 }
27030 // VPERMILPD can permute with a non-repeating shuffle.
27031 if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
27032 Shuffle = X86ISD::VPERMILPI;
27033 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27034 PermuteImm = 0;
27035 for (int i = 0, e = Mask.size(); i != e; ++i) {
27036 int M = Mask[i];
27037 if (M == SM_SentinelUndef)
27038 continue;
27039 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27040 PermuteImm |= (M & 1) << i;
27041 }
27042 return true;
27043 }
27045 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
27046 SmallVector<int, 4> RepeatedMask;
27047 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
27048 return false;
27050 // Narrow the repeated mask for 32-bit element permutes.
27051 SmallVector<int, 4> WordMask = RepeatedMask;
27052 if (MaskScalarSizeInBits == 64)
27053 scaleShuffleMask(2, RepeatedMask, WordMask);
27055 Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
27056 ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
27057 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27058 PermuteImm = getV4X86ShuffleImm(WordMask);
27059 return true;
27060 }
27062 // Attempt to match a combined unary shuffle mask against supported binary
27063 // shuffle instructions.
27064 // TODO: Investigate sharing more of this with shuffle lowering.
27065 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27066 bool AllowFloatDomain, bool AllowIntDomain,
27067 SDValue &V1, SDValue &V2, SDLoc &DL,
27068 SelectionDAG &DAG,
27069 const X86Subtarget &Subtarget,
27070 unsigned &Shuffle, MVT &ShuffleVT,
27071 bool IsUnary) {
27072 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27074 if (MaskVT.is128BitVector()) {
27075 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27076 V2 = V1;
27077 Shuffle = X86ISD::MOVLHPS;
27078 ShuffleVT = MVT::v4f32;
27079 return true;
27080 }
27081 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27082 V2 = V1;
27083 Shuffle = X86ISD::MOVHLPS;
27084 ShuffleVT = MVT::v4f32;
27085 return true;
27086 }
27087 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27088 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27089 std::swap(V1, V2);
27090 Shuffle = X86ISD::MOVSD;
27091 ShuffleVT = MaskVT;
27092 return true;
27093 }
27094 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27095 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27096 Shuffle = X86ISD::MOVSS;
27097 ShuffleVT = MaskVT;
27098 return true;
27099 }
27100 }
27102 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27103 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27104 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27105 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27106 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27107 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27108 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27109 DAG, Subtarget)) {
27110 ShuffleVT = MaskVT;
27111 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27112 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27113 return true;
27114 }
27115 }
27117 return false;
27118 }
27120 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27121 bool AllowFloatDomain,
27122 bool AllowIntDomain,
27123 SDValue &V1, SDValue &V2, SDLoc &DL,
27124 SelectionDAG &DAG,
27125 const X86Subtarget &Subtarget,
27126 unsigned &Shuffle, MVT &ShuffleVT,
27127 unsigned &PermuteImm) {
27128 unsigned NumMaskElts = Mask.size();
27129 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27131 // Attempt to match against PALIGNR byte rotate.
27132 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27133 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27134 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27135 if (0 < ByteRotation) {
27136 Shuffle = X86ISD::PALIGNR;
27137 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27138 PermuteImm = ByteRotation;
27139 return true;
27140 }
27141 }
27143 // Attempt to combine to X86ISD::BLENDI.
27144 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27145 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27146 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27147 uint64_t BlendMask = 0;
27148 bool ForceV1Zero = false, ForceV2Zero = false;
27149 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27150 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27151 BlendMask)) {
27152 if (MaskVT == MVT::v16i16) {
27153 // We can only use v16i16 PBLENDW if the lanes are repeated.
27154 SmallVector<int, 8> RepeatedMask;
27155 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27156 RepeatedMask)) {
27157 assert(RepeatedMask.size() == 8 &&
27158 "Repeated mask size doesn't match!");
27159 PermuteImm = 0;
27160 for (int i = 0; i < 8; ++i)
27161 if (RepeatedMask[i] >= 8)
27162 PermuteImm |= 1 << i;
27163 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27164 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27165 Shuffle = X86ISD::BLENDI;
27166 ShuffleVT = MaskVT;
27167 return true;
27168 }
27169 }
27170 // Determine a type compatible with X86ISD::BLENDI.
27171 ShuffleVT = MaskVT;
27172 if (Subtarget.hasAVX2()) {
27173 if (ShuffleVT == MVT::v4i64)
27174 ShuffleVT = MVT::v8i32;
27175 else if (ShuffleVT == MVT::v2i64)
27176 ShuffleVT = MVT::v4i32;
27177 } else {
27178 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27179 ShuffleVT = MVT::v8i16;
27180 else if (ShuffleVT == MVT::v4i64)
27181 ShuffleVT = MVT::v4f64;
27182 else if (ShuffleVT == MVT::v8i32)
27183 ShuffleVT = MVT::v8f32;
27184 }
27186 if (!ShuffleVT.isFloatingPoint()) {
27187 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27188 BlendMask =
27189 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27190 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27191 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27192 }
27194 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27195 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27196 PermuteImm = (unsigned)BlendMask;
27197 Shuffle = X86ISD::BLENDI;
27198 return true;
27199 }
27200 }
27203 // Attempt to combine to INSERTPS.
27204 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27205 MaskVT.is128BitVector()) {
27206 APInt Zeroable(4, 0);
27207 for (unsigned i = 0; i != NumMaskElts; ++i)
27208 if (isUndefOrZero(Mask[i]))
27209 Zeroable.setBit(i);
27211 if (Zeroable.getBoolValue() &&
27212 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27213 Shuffle = X86ISD::INSERTPS;
27214 ShuffleVT = MVT::v4f32;
27215 return true;
27216 }
27217 }
27219 // Attempt to combine to SHUFPD.
27220 if (AllowFloatDomain && EltSizeInBits == 64 &&
27221 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27222 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27223 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27224 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27225 Shuffle = X86ISD::SHUFP;
27226 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27227 return true;
27228 }
27229 }
27231 // Attempt to combine to SHUFPS.
27232 if (AllowFloatDomain && EltSizeInBits == 32 &&
27233 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27234 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27235 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27236 SmallVector<int, 4> RepeatedMask;
27237 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27238 // Match each half of the repeated mask, to determine if it's just
27239 // referencing one of the vectors, is zeroable, or entirely undef.
27240 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27241 int M0 = RepeatedMask[Offset];
27242 int M1 = RepeatedMask[Offset + 1];
27244 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27245 return DAG.getUNDEF(MaskVT);
27246 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27247 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27248 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27249 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27250 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27251 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27252 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27253 return V1;
27254 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27255 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27256 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27257 return V2;
27258 }
27260 return SDValue();
27261 };
27263 int ShufMask[4] = {-1, -1, -1, -1};
27264 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27265 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27267 if (Lo && Hi) {
27268 V1 = Lo;
27269 V2 = Hi;
27270 Shuffle = X86ISD::SHUFP;
27271 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27272 PermuteImm = getV4X86ShuffleImm(ShufMask);
27273 return true;
27274 }
27275 }
27276 }
27278 return false;
27279 }
27281 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27282 /// possible.
27283 ///
27284 /// This is the leaf of the recursive combine below. When we have found some
27285 /// chain of single-use x86 shuffle instructions and accumulated the combined
27286 /// shuffle mask represented by them, this will try to pattern match that mask
27287 /// into either a single instruction if there is a special purpose instruction
27288 /// for this operation, or into a PSHUFB instruction which is a fully general
27289 /// instruction but should only be used to replace chains over a certain depth.
27290 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27291 ArrayRef<int> BaseMask, int Depth,
27292 bool HasVariableMask, SelectionDAG &DAG,
27293 TargetLowering::DAGCombinerInfo &DCI,
27294 const X86Subtarget &Subtarget) {
27295 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27296 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27297 "Unexpected number of shuffle inputs!");
27299 // Find the inputs that enter the chain. Note that multiple uses are OK
27300 // here, we're not going to remove the operands we find.
27301 bool UnaryShuffle = (Inputs.size() == 1);
27302 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27303 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27304 : peekThroughBitcasts(Inputs[1]));
27306 MVT VT1 = V1.getSimpleValueType();
27307 MVT VT2 = V2.getSimpleValueType();
27308 MVT RootVT = Root.getSimpleValueType();
27309 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27310 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27311 "Vector size mismatch");
27313 SDLoc DL(Root);
27314 SDValue Res;
27316 unsigned NumBaseMaskElts = BaseMask.size();
27317 if (NumBaseMaskElts == 1) {
27318 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27319 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27320 /*AddTo*/ true);
27321 return true;
27322 }
27324 unsigned RootSizeInBits = RootVT.getSizeInBits();
27325 unsigned NumRootElts = RootVT.getVectorNumElements();
27326 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27327 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27328 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27330 // Don't combine if we are an AVX512/EVEX target and the mask element size
27331 // is different from the root element size - this would prevent writemasks
27332 // from being reused.
27333 // TODO - this currently prevents all lane shuffles from occurring.
27334 // TODO - check for writemasks usage instead of always preventing combining.
27335 // TODO - attempt to narrow Mask back to writemask size.
27336 bool IsEVEXShuffle =
27337 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27338 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27339 return false;
27341 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27343 // Handle 128-bit lane shuffles of 256-bit vectors.
27344 // TODO - this should support binary shuffles.
27345 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27346 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27347 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27348 return false; // Nothing to do!
27349 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27350 unsigned PermMask = 0;
27351 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27352 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
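// Example: a lane mask of {1, 0} (swap the two 128-bit halves) encodes as
// PermMask = (1 << 0) | (0 << 4) = 0x01; an undef/zero lane sets the 0x8
// bit instead, which VPERM2X128 treats as "zero this lane".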
27354 Res = DAG.getBitcast(ShuffleVT, V1);
27355 DCI.AddToWorklist(Res.getNode());
27356 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27357 DAG.getUNDEF(ShuffleVT),
27358 DAG.getConstant(PermMask, DL, MVT::i8));
27359 DCI.AddToWorklist(Res.getNode());
27360 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27361 /*AddTo*/ true);
27362 return true;
27363 }
27365 // For masks that have been widened to 128-bit elements or more,
27366 // narrow back down to 64-bit elements.
27367 SmallVector<int, 64> Mask;
27368 if (BaseMaskEltSizeInBits > 64) {
27369 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27370 int MaskScale = BaseMaskEltSizeInBits / 64;
27371 scaleShuffleMask(MaskScale, BaseMask, Mask);
27372 } else {
27373 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27374 }
27376 unsigned NumMaskElts = Mask.size();
27377 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27379 // Determine the effective mask value type.
27380 FloatDomain &= (32 <= MaskEltSizeInBits);
27381 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27382 : MVT::getIntegerVT(MaskEltSizeInBits);
27383 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27385 // Only allow legal mask types.
27386 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27387 return false;
27389 // Attempt to match the mask against known shuffle patterns.
27390 MVT ShuffleSrcVT, ShuffleVT;
27391 unsigned Shuffle, PermuteImm;
27393 // Which shuffle domains are permitted?
27394 // Permit domain crossing at higher combine depths.
27395 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27396 bool AllowIntDomain = !FloatDomain || (Depth > 3);
27398 if (UnaryShuffle) {
27399 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27400 // directly if we don't shuffle the lower element and we shuffle the upper
27401 // (zero) elements within themselves.
27402 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27403 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27404 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27405 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27406 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27407 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27408 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27409 /*AddTo*/ true);
27410 return true;
27411 }
27412 }
27414 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27415 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27416 ShuffleVT)) {
27417 if (Depth == 1 && Root.getOpcode() == Shuffle)
27418 return false; // Nothing to do!
27419 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27420 return false; // AVX512 Writemask clash.
27421 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27422 DCI.AddToWorklist(Res.getNode());
27423 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27424 DCI.AddToWorklist(Res.getNode());
27425 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27426 /*AddTo*/ true);
27427 return true;
27428 }
27430 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27431 AllowIntDomain, Subtarget, Shuffle,
27432 ShuffleVT, PermuteImm)) {
27433 if (Depth == 1 && Root.getOpcode() == Shuffle)
27434 return false; // Nothing to do!
27435 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27436 return false; // AVX512 Writemask clash.
27437 Res = DAG.getBitcast(ShuffleVT, V1);
27438 DCI.AddToWorklist(Res.getNode());
27439 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27440 DAG.getConstant(PermuteImm, DL, MVT::i8));
27441 DCI.AddToWorklist(Res.getNode());
27442 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27443 /*AddTo*/ true);
27444 return true;
27445 }
27446 }
27448 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27449 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27450 UnaryShuffle)) {
27451 if (Depth == 1 && Root.getOpcode() == Shuffle)
27452 return false; // Nothing to do!
27453 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27454 return false; // AVX512 Writemask clash.
27455 V1 = DAG.getBitcast(ShuffleVT, V1);
27456 DCI.AddToWorklist(V1.getNode());
27457 V2 = DAG.getBitcast(ShuffleVT, V2);
27458 DCI.AddToWorklist(V2.getNode());
27459 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27460 DCI.AddToWorklist(Res.getNode());
27461 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27462 /*AddTo*/ true);
27463 return true;
27464 }
27466 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27467 AllowIntDomain, V1, V2, DL, DAG,
27468 Subtarget, Shuffle, ShuffleVT,
27469 PermuteImm)) {
27470 if (Depth == 1 && Root.getOpcode() == Shuffle)
27471 return false; // Nothing to do!
27472 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27473 return false; // AVX512 Writemask clash.
27474 V1 = DAG.getBitcast(ShuffleVT, V1);
27475 DCI.AddToWorklist(V1.getNode());
27476 V2 = DAG.getBitcast(ShuffleVT, V2);
27477 DCI.AddToWorklist(V2.getNode());
27478 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27479 DAG.getConstant(PermuteImm, DL, MVT::i8));
27480 DCI.AddToWorklist(Res.getNode());
27481 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27482 /*AddTo*/ true);
27483 return true;
27484 }
27486 // Don't try to re-form single instruction chains under any circumstances now
27487 // that we've done encoding canonicalization for them.
27488 if (Depth < 2)
27489 return false;
27491 bool MaskContainsZeros =
27492 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27494 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27495 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27496 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27497 ((Subtarget.hasAVX2() &&
27498 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27499 (Subtarget.hasAVX512() &&
27500 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27501 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27502 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27503 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27504 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27505 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27506 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27507 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27508 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27509 DCI.AddToWorklist(VPermMask.getNode());
27510 Res = DAG.getBitcast(MaskVT, V1);
27511 DCI.AddToWorklist(Res.getNode());
27512 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27513 DCI.AddToWorklist(Res.getNode());
27514 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27519 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27520 // vector as the second source.
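    // e.g. (illustrative): with 4 mask elements, a mask <0,Z,2,Z>
    // (Z = SM_SentinelZero) becomes <0,5,2,7>, where indices 4-7 select
    // from the all-zeros second source.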
    if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
      for (unsigned i = 0; i != NumMaskElts; ++i)
        if (Mask[i] == SM_SentinelZero)
          Mask[i] = NumMaskElts + i;

      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      Res = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(Res.getNode());
      SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
      DCI.AddToWorklist(Zero.getNode());
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }
    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
    if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
      MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
      SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      V1 = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(V1.getNode());
      V2 = DAG.getBitcast(MaskVT, V2);
      DCI.AddToWorklist(V2.getNode());
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
      DCI.AddToWorklist(Res.getNode());
      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                    /*AddTo*/ true);
      return true;
    }
    return false;
  }
  // See if we can combine a single input shuffle with zeros to a bit-mask,
  // which is much simpler than any shuffle.
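  // e.g. (illustrative): a v4i32 mask <0,Z,2,Z> (Z = SM_SentinelZero) is just
  // an AND with the constant vector <-1,0,-1,0>.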
  if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
    APInt UndefElts(NumMaskElts, 0);
    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts.setBit(i);
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
    DCI.AddToWorklist(BitMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    unsigned AndOpcode =
        FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes use the variable mask to VPERMILPS.
  // TODO Combine other mask types at higher depths.
  if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
    SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx =
          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
    MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
    SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
    DCI.AddToWorklist(VPermMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
  // to VPERMIL2PD/VPERMIL2PS.
  if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
       MaskVT == MVT::v8f32)) {
    // VPERMIL2 Operation.
    // Bits[3] - Match Bit.
    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
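    // e.g. (illustrative): for v4f32, mask element M = 5 (element 1 of the
    // second source) is encoded as selector index 5 (0b101), while zeroable
    // elements are encoded as 8 with the match-bit immediate set below.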
    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
    SmallVector<int, 8> VPerm2Idx;
    MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
    MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
    unsigned M2ZImm = 0;
    for (int M : Mask) {
      if (M == SM_SentinelUndef) {
        VPerm2Idx.push_back(-1);
        continue;
      }
      if (M == SM_SentinelZero) {
        M2ZImm = 2;
        VPerm2Idx.push_back(8);
        continue;
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
      VPerm2Idx.push_back(Index);
    }
    V1 = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(MaskVT, V2);
    DCI.AddToWorklist(V2.getNode());
    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
    DCI.AddToWorklist(VPerm2MaskOp.getNode());
    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
                      DAG.getConstant(M2ZImm, DL, MVT::i8));
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // If we have 3 or more shuffle instructions or a chain involving a variable
  // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
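  // e.g. (illustrative): a v4i32 mask <1,0,Z,3> expands to the byte mask
  // <4,5,6,7, 0,1,2,3, 255,255,255,255, 12,13,14,15>, where any byte with its
  // high bit set (here 255) zeroes the corresponding result byte.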
  if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
    SmallVector<SDValue, 16> PSHUFBMask;
    int NumBytes = RootVT.getSizeInBits() / 8;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
    Res = DAG.getBitcast(ByteVT, V1);
    DCI.AddToWorklist(Res.getNode());
    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }
  // With XOP, if we have a 128-bit binary input shuffle we can always combine
  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
  // slower than PSHUFB on targets that support both.
  if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
      Subtarget.hasXOP()) {
    // VPPERM Mask Operation
    // Bits[4:0] - Byte Index (0 - 31)
    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
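    // e.g. (illustrative): byte indices 0-15 select from V1 and 16-31 from V2,
    // while 128 (permute operation 4 in Bits[7:5]) zeroes the result byte.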
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = 16;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::v16i8;
    V1 = DAG.getBitcast(ByteVT, V1);
    DCI.AddToWorklist(V1.getNode());
    V2 = DAG.getBitcast(ByteVT, V2);
    DCI.AddToWorklist(V2.getNode());
    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
    DCI.AddToWorklist(VPPERMMaskOp.getNode());
    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
    DCI.AddToWorklist(Res.getNode());
    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
                  /*AddTo*/ true);
    return true;
  }

  // Failed to find any combines.
  return false;
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
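// e.g. (illustrative): shuffling constant vectors <1,2,3,4> and <5,6,7,8>
// with mask <0,5,2,7> folds the whole chain to the constant <1,6,3,8>.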
static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
                                        ArrayRef<int> Mask, SDValue Root,
                                        bool HasVariableMask, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Root.getSimpleValueType();

  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  bool OneUseConstantOp = false;
  SmallVector<APInt, 16> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue SrcOp = Ops[i];
    OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return false;
  }

  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle; this
  // avoids constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return false;

  // Shuffle the constant bits according to the mask.
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  DCI.AddToWorklist(CstOp.getNode());
  DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
  return true;
}
/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                          int SrcOpIndex, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          ArrayRef<const SDNode *> SrcNodes,
                                          int Depth, bool HasVariableMask,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
    return false;

  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");
  int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
  assert(((RootRatio == 1 && OpRatio == 1) ||
          (RootRatio == 1) != (OpRatio == 1)) &&
         "Must not have a ratio for both incoming and op masks!");

  SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
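  // e.g. (illustrative): a root mask <1,0> over v2i64 merged with an op mask
  // <0,0,2,2> over v4i32 gives MaskWidth = 4, RootRatio = 2, OpRatio = 1 and
  // an accumulated mask of <2,2,0,0> (before the input-index adjustment).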
  for (int i = 0; i < MaskWidth; ++i) {
    int RootIdx = i / RootRatio;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx %= MaskWidth;

    int OpIdx = RootMaskedIdx / OpRatio;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
    OpMaskedIdx %= MaskWidth;

    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }
  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
    return true;
  }
  if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
    // TODO - should we handle the mixed zero/undef case as well? Just returning
    // a zero mask will lose information on undef elements possibly reducing
    // future combine possibilities.
    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
                                                Subtarget, DAG, SDLoc(Root)));
    return true;
  }

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a
  // single use (i.e. current Op) or all its users have already been combined.
  for (int i = 0, e = Ops.size(); i < e; ++i)
    if (Ops[i].getNode()->hasOneUse() ||
        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
                                        Depth + 1, HasVariableMask, DAG, DCI,
                                        Subtarget))
        return true;

  // Attempt to constant fold all of the constant source ops.
  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
                                  Subtarget))
    return true;

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return false;
  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
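  // e.g. (illustrative): a v4i32 mask <0,1,6,7> widens to the v2i64 mask
  // <0,3>, which is then tried again in case it can be halved further.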
  SmallVector<int, 64> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
    WidenedMask.clear();
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                DCI, Subtarget);
}
/// \brief Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
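/// For example (illustrative): a 128-bit PSHUFHW with the full mask
/// <0,1,2,3,7,6,5,4> yields the v4 mask <3,2,1,0>.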
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  }

  llvm_unreachable("No valid shuffle instruction found!");
}
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
/// through shuffles which switch halves trying to find a shuffle of the same
/// pair of dwords.
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
                                        SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  assert(
      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
      "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);
  unsigned CombineOpcode = N.getOpcode();

  // Walk up a single-use chain looking for a combinable shuffle.
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return false; // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      if (V.getOpcode() == CombineOpcode)
        break;

      // Other-half shuffles are no-ops.
      continue;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return false;

  // Combine away the bottom node as its shuffle will be accumulated into
  // a preceding shuffle.
  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Record the old value.
  SDValue Old = V;

  // Merge this node's mask and our incoming mask (adjusted to account for all
  // the pshufd instructions encountered).
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Check that the shuffles didn't cancel each other out. If not, we need to
  // combine to the new one.
  if (Old != V)
    // Replace the combinable shuffle with the combined one, updating all users
    // so that we re-evaluate the chain here.
    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

  return true;
}
/// \brief Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    unsigned Opcode0 = Op0.getOpcode();
    unsigned Opcode1 = Op1.getOpcode();

    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
    // TODO: Add other horizontal operations as required.
    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));

    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
           "Unexpected input vector types");

    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
    // operands and changing the mask to 1. This saves us a bunch of
    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    // x86InstrInfo knows how to commute this back after instruction selection
    // if it would help register allocation.
    //
    // TODO: If optimizing for size or a processor that doesn't suffer from
    // partial register update stalls, this should be transformed into a MOVSD
    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

    if (VT == MVT::v2f64)
      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
        }

    return SDValue();
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
    if (isZero0 && isZero1)
      return SDValue();

    // We often lower to MOVSD/MOVSS from integer as well as native float
    // types; remove unnecessary domain-crossing bitcasts if we can to make it
    // easier to combine shuffles later on. We've already accounted for the
    // domain switching cost when we decided to lower with it.
    bool isFloat = VT.isFloatingPoint();
    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;

    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }
  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
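    // e.g. (illustrative): pshuflw <2,3,0,1> swaps the two word pairs of the
    // low half, which is exactly pshufd <1,0,2,3> on the same 128 bits.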
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      DCI.AddToWorklist(V.getNode());
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      DCI.AddToWorklist(V.getNode());
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
/// operation. If true is returned then the operands of ADDSUB operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation, to try to flow the fact that they're unused through the
/// rest of the combiner.
static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
                     SDValue &Opnd0, SDValue &Opnd1) {

  EVT VT = N->getValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // We require the first shuffle operand to be the FSUB node, and the second to
  // be the FADD node.
  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(V1, V2);
  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
    return false;

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())
    return false;

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
    return false;

  // We're looking for blends between FADD and FSUB nodes. We insist on these
  // nodes being lined up in a specific expected pattern.
  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
                                           8, 25, 10, 27, 12, 29, 14, 31})))
    return false;

  Opnd0 = V1;
  Opnd1 = V2;
  return true;
}
/// \brief Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
    return SDValue();

  EVT VT = N->getValueType(0);

  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();

  if (VT.getVectorElementType() != MVT::i32 &&
      VT.getVectorElementType() != MVT::i64 &&
      VT.getVectorElementType() != MVT::f32 &&
      VT.getVectorElementType() != MVT::f64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check that both sources are concats with undef.
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
      !N1.getOperand(1).isUndef())
    return SDValue();

  // Construct the new shuffle mask. Elements from the first source retain their
  // index, but elements from the second source no longer need to skip an undef.
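  // e.g. (illustrative): for a v4i32 output, the mask <0,1,4,5> over
  // (concat t1, undef) and (concat t2, undef) becomes <0,1,2,3> over
  // (concat t1, t2).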
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  SDLoc DL(N);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
                               N1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT))
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default:
        break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::MUL:
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        break;
      case ISD::FADD:
      case ISD::FSUB:
      case ISD::FMUL:
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();
        break;
      }

      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

      if (CanFold) {
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
      }
    }
  }
  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    Elts.clear();
    break;
  }

  if (Elts.size() == VT.getVectorNumElements())
    if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
      return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ.
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue
XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
                                         : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses.
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job.
  SDLoc dl(N);

  // Create shuffle node taking into account the case that it's a unary shuffle.
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
                                 ShuffleMask);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.

  // Detect bitcasts between i32 to x86mmx low word.
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType() == MVT::i32)
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
  }

  // Detect bitcasts between element or subvector extraction to x86mmx.
  if (VT == MVT::x86mmx &&
      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
      isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType().is128BitVector())
      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
                         DAG.getBitcast(MVT::v2i64, N00));
  }

  // Detect bitcasts from FP_TO_SINT to x86mmx.
  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
      N0.getOpcode() == ISD::FP_TO_SINT) {
    SDLoc DL(N0);
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                              DAG.getUNDEF(MVT::v2i32));
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                       DAG.getBitcast(MVT::v2i64, Res));
  }

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  default: return SDValue();
  }

  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64)))
    return SDValue();

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);
  SDLoc DL0(N0);

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
  }
  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
  }

  return SDValue();
}
// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
  // The pattern must end in an extract from index 0.
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
      !isNullConstant(Extract->getOperand(1)))
    return SDValue();

  unsigned Stages =
      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());

  SDValue Op = Extract->getOperand(0);
  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != BinOp)
      return SDValue();

    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
    if (Shuffle) {
      Op = Op.getOperand(1);
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || (Shuffle->getOperand(0) != Op))
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  return Op;
}
// Given a select, detect the following pattern:
// 1: %2 = zext <N x i8> %0 to <N x i32>
// 2: %3 = zext <N x i8> %1 to <N x i32>
// 3: %4 = sub nsw <N x i32> %2, %3
// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
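// In other words, the select computes |%0 - %1| per element, zero-extended
// to i32, i.e. the absolute-difference half of a sum-of-absolute-differences.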
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is the
  // difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1 or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal == 1) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1 or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {
  // Find the appropriate width for the PSADBW.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD.
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
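// For example (a sketch, assuming v4i8 inputs on SSE2): InVT is 32 bits, so
// NumConcat == 4 and each input is concatenated with three zero vectors up
// to v16i8. PSADBW then sums |a[i] - b[i]| within each 8-byte group; the
// padded lanes contribute |0 - 0| == 0, so the v2i64 result is unchanged.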
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8)
    return SDValue();

  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
    SDValue Match = matchBinOpReduction(Extract, Op);
    if (!Match)
      continue;

    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
    // which we can't support here for now.
    if (Match.getScalarValueSizeInBits() != BitWidth)
      continue;

    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 &&
           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
      return SDValue();

    // Don't bother performing this for 2-element vectors.
    if (Match.getValueType().getVectorNumElements() <= 2)
      return SDValue();

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)
      return SDValue();

    // For 32/64-bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    MVT MaskVT;
    if (64 == BitWidth || 32 == BitWidth)
      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                MatchSizeInBits / BitWidth);
    else
      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    APInt CompareBits;
    ISD::CondCode CondCode;
    if (Op == ISD::OR) {
      // any_of -> MOVMSK != 0
      CompareBits = APInt::getNullValue(32);
      CondCode = ISD::CondCode::SETNE;
    } else {
      // all_of -> MOVMSK == ((1 << NumElts) - 1)
      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
      CondCode = ISD::CondCode::SETEQ;
    }

    // Perform the select as i32/i64 and then truncate to avoid partial register
    // stalls.
    unsigned ResWidth = std::max(BitWidth, 32u);
    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);

    SDLoc DL(Extract);
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
    SDValue Res = DAG.getBitcast(MaskVT, Match);
    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
                          Ones, Zero, CondCode);
    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
  }

  return SDValue();
}
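// Illustrative example of the combine above (not taken from a test): an
// all_of reduction of a v4i32 compare result reaches here as an AND pyramid
// whose lanes are all-ones/all-zeros, and becomes roughly
//   movmskps %xmm0, %eax ; cmpl $15, %eax
// followed by a select of all-ones/zero, i.e. MOVMSK of the mask compared
// against the all-lanes value 0xf; any_of instead compares against 0 with
// SETNE.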
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Verify the type we're extracting from is any integer type above i16.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Match shuffle + add pyramid.
  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
               Root.getOpcode() == ISD::ZERO_EXTEND ||
               Root.getOpcode() == ISD::ANY_EXTEND))
    Root = Root.getOperand(0);

  // If there was a match, we want Root to be a select that is the root of an
  // abs-diff pattern.
  if (!Root || (Root.getOpcode() != ISD::VSELECT))
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  MVT SadVT = SAD.getSimpleValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    for (unsigned i = Stages - 3; i > 0; --i) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  MVT Type = Extract->getSimpleValueType(0);
  unsigned TypeSizeInBits = Type.getSizeInBits();
  // Return the lowest TypeSizeInBits bits.
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                     Extract->getOperand(1));
}
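// End-to-end sketch of the pattern handled above (illustrative only): for a
// v16i8 SAD reduced to i32, the DAG
//   extract(add-reduction(abs(sub(zext %a, zext %b))), 0)
// becomes a single PSADBW of %a and %b followed by one shuffle+add stage to
// combine the two 8-byte partial sums, then an extract of lane 0.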
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue Src = N->getOperand(0);
  SDValue Idx = N->getOperand(1);

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Don't attempt this for boolean mask vectors or unknown extraction indices.
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
    return SDValue();

  // Resolve the target shuffle inputs and mask.
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
    return SDValue();

  // Attempt to narrow/widen the shuffle mask to the correct size.
  if (Mask.size() != NumSrcElts) {
    if ((NumSrcElts % Mask.size()) == 0) {
      SmallVector<int, 16> ScaledMask;
      int Scale = NumSrcElts / Mask.size();
      scaleShuffleMask(Scale, Mask, ScaledMask);
      Mask = std::move(ScaledMask);
    } else if ((Mask.size() % NumSrcElts) == 0) {
      SmallVector<int, 16> WidenedMask;
      while (Mask.size() > NumSrcElts &&
             canWidenShuffleElements(Mask, WidenedMask))
        Mask = std::move(WidenedMask);
      // TODO - investigate support for wider shuffle masks with known upper
      // undef/zero elements for implicit zero-extension.
    }
  }

  // Check if narrowing/widening failed.
  if (Mask.size() != NumSrcElts)
    return SDValue();

  int SrcIdx = Mask[N->getConstantOperandVal(1)];
  SDLoc dl(N);

  // If the shuffle source element is undef/zero then we can just accept it.
  if (SrcIdx == SM_SentinelUndef)
    return DAG.getUNDEF(VT);

  if (SrcIdx == SM_SentinelZero)
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
                                : DAG.getConstant(0, dl, VT);

  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
  SrcIdx = SrcIdx % Mask.size();

  // We can only extract other elements from 128-bit vectors and in certain
  // circumstances, depending on SSE-level.
  // TODO: Investigate using extract_subvector for larger vectors.
  // TODO: Investigate float/double extraction if it will be just stored.
  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
    assert(SrcSVT == VT && "Unexpected extraction type");
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
                       DAG.getIntPtrConstant(SrcIdx, dl));
  }

  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
           "Unexpected extraction type");
    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
                                DAG.getIntPtrConstant(SrcIdx, dl));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
                                 DAG.getValueType(SrcSVT));
    return DAG.getZExtOrTrunc(Assert, dl, VT);
  }

  return SDValue();
}
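// For instance (a sketch, not an exhaustive case): extracting element 2 of a
// v8i16 target shuffle whose mask sends lane 2 to lane 5 of its first input
// becomes (PEXTRW $5, input) wrapped in AssertZext, so later combines know
// the upper 16 bits of the i32 result are already zero.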
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
    return NewOp;

  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDValue EltIdx = N->getOperand(1);

  EVT SrcVT = InputVector.getValueType();
  EVT VT = N->getValueType(0);
  SDLoc dl(InputVector);

  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getBitcast(VT, InputVector);
  }

  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
  }

  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
      isa<ConstantSDNode>(EltIdx) &&
      isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt = N->getConstantOperandVal(1);
    uint64_t InputValue = InputVector.getConstantOperandVal(0);
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }

  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
    return SAD;

  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
    return Cmp;

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (SrcVT != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  // If 64-bit shifts are legal, use the extract-shift sequence,
  // otherwise bounce the vector off the cache.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  SDValue Vals[4];
  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
    auto &DL = DAG.getDataLayout();
    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                     DAG.getConstant(0, dl, VecIdxTy));
    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                  DAG.getConstant(1, dl, VecIdxTy));

    SDValue ShAmt = DAG.getConstant(
        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
  } else {
    // Store the value to a temporary stack slot.
    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                              MachinePointerInfo());

    EVT ElementType = SrcVT.getVectorElementType();
    unsigned EltSize = ElementType.getSizeInBits() / 8;

    // Replace each use (extract) with a load of the appropriate element.
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t Offset = EltSize * i;
      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

      SDValue ScalarAddr =
          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

      // Load the scalar.
      Vals[i] =
          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
    }
  }

  // Replace the extracts.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    uint64_t IdxVal = Extract->getConstantOperandVal(1);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}
// TODO - merge with combineExtractVectorElt once it can handle the implicit
// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
  // Check if the first operand is all zeros and Cond type is vXi1.
  // This situation only applies to avx512.
  if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
      CondVT.getVectorElementType() == MVT::i1) {
    // Invert the cond to not(cond) : xor(op,allones)=not(op)
    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
                                  DAG.getAllOnesConstant(DL, CondVT));
    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. I.e., the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
  FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

  // Try to invert the condition if true value is not all 1s and false value is
  // not all 0s.
  if (!TValIsAllOnes && !FValIsAllZeros &&
      // Check if the selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

    if (TValIsAllZeros || FValIsAllOnes) {
      SDValue CC = Cond.getOperand(2);
      ISD::CondCode NewCC =
          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                               Cond.getOperand(0).getValueType().isInteger());
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          NewCC);
      std::swap(LHS, RHS);
      TValIsAllOnes = FValIsAllOnes;
      FValIsAllZeros = TValIsAllZeros;
    }
  }

  // vselect Cond, 111..., 000... -> Cond
  if (TValIsAllOnes && FValIsAllZeros)
    return DAG.getBitcast(VT, Cond);

  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
    return SDValue();

  // vselect Cond, 111..., X -> or Cond, X
  if (TValIsAllOnes) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, Or);
  }

  // vselect Cond, X, 000... -> and Cond, X
  if (FValIsAllZeros) {
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
    return DAG.getBitcast(VT, And);
  }

  return SDValue();
}
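// The folds above, summarized (C is a lane-sized vector condition):
//   (vselect C, AllOnes, AllZeros) --> (bitcast C)
//   (vselect C, AllOnes, X)        --> (or C, X)
//   (vselect C, X, AllZeros)       --> (and C, X)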
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!TrueC || !FalseC)
    return SDValue();

  // Don't do this for crazy integer types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
    return SDValue();

  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
  // so that TrueC (the true value) is larger than FalseC.
  bool NeedsCondInvert = false;
  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
      // Efficiently invertible.
      (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
       (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
        isa<ConstantSDNode>(Cond.getOperand(1))))) {
    NeedsCondInvert = true;
    std::swap(TrueC, FalseC);
  }

  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
    if (NeedsCondInvert) // Invert the condition if needed.
      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                         DAG.getConstant(1, DL, Cond.getValueType()));

    // Zero extend the condition if needed.
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

    unsigned ShAmt = TrueC->getAPIntValue().logBase2();
    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                       DAG.getConstant(ShAmt, DL, MVT::i8));
  }

  // Optimize cases that will turn into an LEA instruction. This requires
  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
    uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
    if (N->getValueType(0) == MVT::i32)
      Diff = (unsigned)Diff;

    bool isFastMultiplier = false;
    if (Diff < 10) {
      switch ((unsigned char)Diff) {
      default:
        break;
      case 1: // result = add base, cond
      case 2: // result = lea base(    , cond*2)
      case 3: // result = lea base(cond, cond*2)
      case 4: // result = lea base(    , cond*4)
      case 5: // result = lea base(cond, cond*4)
      case 8: // result = lea base(    , cond*8)
      case 9: // result = lea base(cond, cond*8)
        isFastMultiplier = true;
        break;
      }
    }

    if (isFastMultiplier) {
      APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
      if (NeedsCondInvert) // Invert the condition if needed.
        Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(1, DL, Cond.getValueType()));

      // Zero extend the condition if needed.
      Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
      // Scale the condition by the difference.
      if (Diff != 1)
        Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(Diff, DL, Cond.getValueType()));

      // Add the base if non-zero.
      if (FalseC->getAPIntValue() != 0)
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));
      return Cond;
    }
  }

  return SDValue();
}
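// Worked example for the LEA path above (illustrative values): for
// (select C, 11, 2) the difference is 9, one of the fast multipliers, so we
// form zext(C)*9 + 2, which matches a single "leal 2(%cond,%cond,8)".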
// If this is a bitcasted op that can be represented as another type, push the
// bitcast to the inputs. This allows more opportunities for pattern
// matching masked instructions. This is called when we know that the operation
// is used as one of the inputs of a vselect.
static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  // Make sure we have a bitcast.
  if (OrigOp.getOpcode() != ISD::BITCAST)
    return false;

  SDValue Op = OrigOp.getOperand(0);

  // If the operation is used by anything other than the bitcast, we shouldn't
  // do this combine as that would replicate the operation.
  if (!Op.hasOneUse())
    return false;

  MVT VT = OrigOp.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc DL(Op.getNode());

  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
                                      SDValue Op2) {
    Op0 = DAG.getBitcast(VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    Op1 = DAG.getBitcast(VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
    return true;
  };

  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::PALIGNR:
    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
    if (!VT.is128BitVector())
      return false;
    Opcode = X86ISD::VALIGN;
    LLVM_FALLTHROUGH;
  case X86ISD::VALIGN: {
    if (EltVT != MVT::i32 && EltVT != MVT::i64)
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
    unsigned EltSize = EltVT.getSizeInBits();
    // Make sure we can represent the same shift with the new VT.
    if ((ShiftAmt % EltSize) != 0)
      return false;
    Imm = ShiftAmt / EltSize;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    DAG.getConstant(Imm, DL, MVT::i8));
  }
  case X86ISD::SHUF128: {
    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    Op.getOperand(2));
  }
  case ISD::INSERT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    // Op1 needs to be bitcasted to a smaller vector with the same element type.
    SDValue Op1 = Op.getOperand(1);
    MVT Op1VT = MVT::getVectorVT(EltVT,
                                 Op1.getSimpleValueType().getSizeInBits() / EltSize);
    Op1 = DAG.getBitcast(Op1VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case ISD::EXTRACT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    // Op0 needs to be bitcasted to a larger vector with the same element type.
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case X86ISD::SUBV_BROADCAST: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0));
    return true;
  }
  }

  return false;
}
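// Example for the PALIGNR/VALIGN case above (a sketch): a v16i8 PALIGNR with
// immediate 4 that is only used bitcast to v4i32 shifts by 32 bits, which is
// exactly one i32 element, so it can be rewritten as VALIGND with immediate 1
// and then folded into an AVX-512 masked operation.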
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && VT != MVT::f128 &&
      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
      (Subtarget.hasSSE2() ||
       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
  // lowering on KNL. In this case we convert it to
  // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
  // The same situation applies to all 128 and 256-bit vectors of i8 and i16.
  // Since SKX these selects have a proper lowering.
  if (Subtarget.hasAVX512() && CondVT.isVector() &&
      CondVT.getVectorElementType() == MVT::i1 &&
      (VT.is128BitVector() || VT.is256BitVector()) &&
      (VT.getVectorElementType() == MVT::i8 ||
       VT.getVectorElementType() == MVT::i16) &&
      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
    DCI.AddToWorklist(Cond.getNode());
    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
  }

  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
    return V;

  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare
  // against zero. e.g.
  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
  //
  // testl   %edi, %edi
  // movl    $0, %eax
  // cmovgl  %edi, %eax
  // =>
  // xorl    %eax, %eax
  // testl   %edi, %edi
  // cmovsl  %eax, %edi
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    switch (CC) {
    default: break;
    case ISD::SETLT:
    case ISD::SETGT: {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
    }
    }
  }
  // Early exit check.
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Match VSELECTs into subs with unsigned saturation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
    // left side invert the predicate to simplify logic below.
    SDValue Other;
    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
      Other = RHS;
      CC = ISD::getSetCCInverse(CC, true);
    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
      Other = LHS;
    }

    if (Other.getNode() && Other->getNumOperands() == 2 &&
        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
      SDValue CondRHS = Cond->getOperand(1);

      // Look for a general sub with unsigned saturation first.
      // x >= y ? x-y : 0 --> subus x, y
      // x >  y ? x-y : 0 --> subus x, y
      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
              // If the RHS is a constant we have to reverse the const
              // canonicalization.
              // x > C-1 ? x+-C : 0 --> subus x, C
              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
                  CondRHSConst->getAPIntValue() ==
                      (-OpRHSConst->getAPIntValue() - 1))
                return DAG.getNode(
                    X86ISD::SUBUS, DL, VT, OpLHS,
                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

          // Another special case: If C was a sign bit, the sub has been
          // canonicalized into a xor.
          // FIXME: Would it be better to use computeKnownBits to determine
          //        whether it's safe to decanonicalize the xor?
          // x s< 0 ? x^C : 0 --> subus x, C
          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
              OpRHSConst->getAPIntValue().isSignMask())
            // Note that we have to rebuild the RHS constant here to ensure we
            // don't rely on particular values of undef lanes.
            return DAG.getNode(
                X86ISD::SUBUS, DL, VT, OpLHS,
                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
        }
    }
  }
  if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
    return V;

  // If this is a *dynamic* select (non-constant condition) and we can match
  // this node with one of the variable blend instructions, restructure the
  // condition so that blends can use the high (sign) bit of each element and
  // use SimplifyDemandedBits to simplify the condition operand.
  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
      !DCI.isBeforeLegalize() &&
      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
    unsigned BitWidth = Cond.getScalarValueSizeInBits();

    // Don't optimize vector selects that map to mask-registers.
    if (BitWidth == 1)
      return SDValue();

    // We can only handle the cases where VSELECT is directly legal on the
    // subtarget. We custom lower VSELECT nodes with constant conditions and
    // this makes it hard to see whether a dynamic VSELECT will correctly
    // lower, so we both check the operation's status and explicitly handle the
    // cases where a *dynamic* blend will fail even though a constant-condition
    // blend could be custom lowered.
    // FIXME: We should find a better way to handle this class of problems.
    // Potentially, we should combine constant-condition vselect nodes
    // pre-legalization into shuffles and not mark as many types as custom
    // lowered.
    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
      return SDValue();
    // FIXME: We don't support i16-element blends currently. We could and
    // should support them by making *all* the bits in the condition be set
    // rather than just the high bit and using an i8-element blend.
    if (VT.getVectorElementType() == MVT::i16)
      return SDValue();
    // Dynamic blending was only available from SSE4.1 onward.
    if (VT.is128BitVector() && !Subtarget.hasSSE41())
      return SDValue();
    // Byte blends are only available in AVX2.
    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
      return SDValue();

    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask(APInt::getSignMask(BitWidth));
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                          DCI.isBeforeLegalizeOps());
    if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
      // If we changed the computation somewhere in the DAG, this change will
      // affect all users of Cond. Make sure it is fine and update all the nodes
      // so that we do not use the generic VSELECT anymore. Otherwise, we may
      // perform wrong optimizations as we messed with the actual expectation
      // for the vector boolean values.
      if (Cond != TLO.Old) {
        // Check all uses of the condition operand to check whether it will be
        // consumed by non-BLEND instructions. Those may require that all bits
        // are set properly.
        for (SDNode *U : Cond->uses()) {
          // TODO: Add other opcodes eventually lowered into BLEND.
          if (U->getOpcode() != ISD::VSELECT)
            return SDValue();
        }

        // Update all users of the condition before committing the change, so
        // that the VSELECT optimizations that expect the correct vector boolean
        // value will not be triggered.
        for (SDNode *U : Cond->uses()) {
          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
                                   U->getValueType(0), Cond, U->getOperand(1),
                                   U->getOperand(2));
          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
        }
        DCI.CommitTargetLoweringOpt(TLO);
        return SDValue();
      }
      // Only Cond (rather than other nodes in the computation chain) was
      // changed. Change the condition just for N to keep the opportunity to
      // optimize all other users their own way.
      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
      return SDValue();
    }
  }

  // Look for vselects with LHS/RHS being bitcasted from an operation that
  // can be executed on another type. Push the bitcast to the inputs of
  // the operation. This exposes opportunities for using masking instructions.
  if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
      CondVT.getVectorElementType() == MVT::i1) {
    if (combineBitcastForMaskedOp(LHS, DAG, DCI))
      return SDValue(N, 0);
    if (combineBitcastForMaskedOp(RHS, DAG, DCI))
      return SDValue(N, 0);
  }

  return SDValue();
}
/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
                                       SelectionDAG &DAG) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Can't replace the cmp if it has more uses than the one we're looking at.
  // FIXME: We would like to be able to handle this, but would need to make sure
  // all uses were updated.
  if (!Cmp.hasOneUse())
    return SDValue();

  // This only applies to variations of the common case:
  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // Using the proper condcodes (see below), overflow is checked for.
  //
  // FIXME: We can generalize both constraints:
  // - XOR/OR/AND (if they were made to survive AtomicExpand)
  // - LHS != 1
  // if the result is compared.

  SDValue CmpLHS = Cmp.getOperand(0);
  SDValue CmpRHS = Cmp.getOperand(1);

  if (!CmpLHS.hasOneUse())
    return SDValue();

  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
    return SDValue();

  const unsigned Opc = CmpLHS.getOpcode();

  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();

  SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();

  APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  if (CC == X86::COND_S && Addend == 1)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && Addend == 1)
    CC = X86::COND_G;
  else if (CC == X86::COND_G && Addend == -1)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && Addend == -1)
    CC = X86::COND_L;
  else
    return SDValue();

  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
                                DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}
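// For instance (a sketch in source terms): testing the sign of the value
// returned by an atomic fetch-add of 1 can be emitted as
//   lock addl $1, (mem) ; jle ...
// reusing EFLAGS of the LOCKed instruction (COND_S on the old value becomes
// COND_LE on the new one) instead of a separate register add and compare.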
// Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
//
// Simplify the following patterns:
// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
// to (Op EFLAGS Cond)
//
// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
// to (Op EFLAGS !Cond)
//
// where Op could be BRCOND or CMOV.
//
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
  // This combine only operates on CMP-like nodes.
  if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();

  // Quit if not used as a boolean value.
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // an SetCC or extended from it.
  SDValue Op1 = Cmp.getOperand(0);
  SDValue Op2 = Cmp.getOperand(1);

  SDValue SetCC;
  const ConstantSDNode* C = nullptr;
  bool needOppositeCond = (CC == X86::COND_E);
  bool checkAgainstTrue = false; // Is it a comparison against 1?

  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if all operands are not constants.
    return SDValue();

  if (C->getZExtValue() == 1) {
    needOppositeCond = !needOppositeCond;
    checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();

  bool truncatedToBoolWithAnd = false;
  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
         SetCC.getOpcode() == ISD::TRUNCATE ||
         SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx < 0)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }

  switch (SetCC.getOpcode()) {
  case X86ISD::SETCC_CARRY:
    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
    // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    // Set the condition code or opposite one if necessary.
    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(1);
  case X86ISD::CMOV: {
    // Check whether false/true value has canonical one, i.e. 0 or 1.
    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
      // Skip 'zext' or 'trunc' node.
      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
          Op.getOpcode() == ISD::TRUNCATE)
        Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if false cond is
      // found.
      if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
    // Quit if false value is not the constant 0 or 1.
    bool FValIsFalse = true;
    if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
      // If FVal is 1, opposite cond is needed.
      needOppositeCond = !needOppositeCond;
      FValIsFalse = false;
    }
    // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
    if (needOppositeCond)
      CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}
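// For example (illustrative): (brcond (cmp (setcc cond, EFLAGS), 0), COND_NE)
// collapses to (brcond EFLAGS, cond) -- the materialized boolean and its
// compare disappear, and the branch tests the original flags directly.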
/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
///   (X86cmp (and (X86setcc) (X86setcc)), 0)
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
  if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;

  SDValue SetCC0, SetCC1;
  switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  }

  // Make sure we have SETCC nodes, using the same flags value.
  if (SetCC0.getOpcode() != X86ISD::SETCC ||
      SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;

  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}

/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
                                  SelectionDAG &DAG) {
  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;
  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }

  // Try to simplify the EFLAGS and condition code operands.
  // We can't always do this as FCMOV only supports a subset of X86 cond.
  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
                       Flags};
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
    }
  }

  // If this is a select between two integer constants, try to do some
  // optimizations. Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, DL, MVT::i8));
        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        Cond = getSETCC(CC, Cond, DL, DAG);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction. This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1: // result = add base, cond
          case 2: // result = lea base(    , cond*2)
          case 3: // result = lea base(cond, cond*2)
          case 4: // result = lea base(    , cond*4)
          case 5: // result = lea base(cond, cond*4)
          case 8: // result = lea base(    , cond*8)
          case 9: // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
          Cond = getSETCC(CC, Cond, DL, DAG);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, DL, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
30630 // Handle these cases:
30631 // (select (x != c), e, c) -> select (x != c), e, x),
30632 // (select (x == c), c, e) -> select (x == c), x, e)
30633 // where the c is an integer constant, and the "select" is the combination
30634 // of CMOV and CMP.
30636 // The rationale for this change is that the conditional-move from a constant
30637 // needs two instructions, however, conditional-move from a register needs
30638 // only one instruction.
30640 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30641 // some instruction-combining opportunities. This opt needs to be
30642 // postponed as late as possible.
30644 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
30645 // the DCI.xxxx conditions are provided to postpone the optimization as
30646 // late as possible.
30648 ConstantSDNode *CmpAgainst = nullptr;
30649 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30650 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30651 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30653 if (CC == X86::COND_NE &&
30654 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30655 CC = X86::GetOppositeBranchCondition(CC);
30656 std::swap(TrueOp, FalseOp);
30659 if (CC == X86::COND_E &&
30660 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30661 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30662 DAG.getConstant(CC, DL, MVT::i8), Cond };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30668 // Fold and/or of setcc's to double CMOV:
30669 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30670 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30672 // This combine lets us generate:
30673 // cmovcc1 (jcc1 if we don't have CMOV)
30679 // cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
30682 // When we can use CMOV, or when there is no mispredict, this improves
30683 // throughput and reduces register pressure.
30685 if (CC == X86::COND_NE) {
SDValue Flags;
X86::CondCode CC0, CC1;
bool isAndSetCC;
if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
if (isAndSetCC) {
std::swap(FalseOp, TrueOp);
30692 CC0 = X86::GetOppositeBranchCondition(CC0);
30693 CC1 = X86::GetOppositeBranchCondition(CC1);
SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
                  Flags};
30698 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30699 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30700 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30701 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
30709 /// Different mul shrinking modes.
30710 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
30712 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30713 EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
  return false;
30717 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30718 unsigned SignBits[2] = {1, 1};
30719 bool IsPositive[2] = {false, false};
30720 for (unsigned i = 0; i < 2; i++) {
30721 SDValue Opd = N->getOperand(i);
// DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30724 // compute signbits for it separately.
30725 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
// For anyextend, it is safe to assume an appropriate number of leading
// sign/zero bits.
if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
  SignBits[i] = 25;
else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
         MVT::i16)
  SignBits[i] = 17;
else
  return false;
IsPositive[i] = true;
30736 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
// All the operands of BUILD_VECTOR need to be integer constants.
30738 // Find the smallest value range which all the operands belong to.
30740 IsPositive[i] = true;
30741 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
if (SubOp.isUndef())
  continue;
auto *CN = dyn_cast<ConstantSDNode>(SubOp);
if (!CN)
  return false;
30747 APInt IntVal = CN->getAPIntValue();
30748 if (IntVal.isNegative())
30749 IsPositive[i] = false;
30750 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
30753 SignBits[i] = DAG.ComputeNumSignBits(Opd);
30754 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
30755 IsPositive[i] = true;
30759 bool AllPositive = IsPositive[0] && IsPositive[1];
30760 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
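// (Illustrative note: an i32 value that fits in a signed 8-bit range has at
// least 32 - 8 + 1 = 25 sign bits, which is where the thresholds below come
// from; 17 sign bits likewise correspond to a signed 16-bit range.)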
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
  Mode = MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
  Mode = MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
  Mode = MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
  Mode = MULU16;
else
  return false;
return true;
30778 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
30780 /// efficient code. Two typical patterns are handled:
30782 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30783 /// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30785 /// %5 = mul <N x i32> %2, %4
30788 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30789 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30790 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30791 /// %5 = mul <N x i32> %2, %4
30793 /// There are four mul shrinking modes:
30794 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30796 /// generate pmullw+sext32 for it (MULS8 mode).
30797 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30798 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30799 /// generate pmullw+zext32 for it (MULU8 mode).
30800 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30801 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30802 /// generate pmullw+pmulhw for it (MULS16 mode).
30803 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30804 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30805 /// generate pmullw+pmulhuw for it (MULU16 mode).
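/// For example (illustrative): a <8 x i32> mul whose operands are both
/// sign-extended from <8 x i8> can be lowered as a single pmullw on <8 x i16>
/// followed by a sign extension, instead of a full 32-bit vector multiply.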
30806 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30807 const X86Subtarget &Subtarget) {
30808 // Check for legality
// pmullw/pmulhw require SSE2.
if (!Subtarget.hasSSE2())
  return SDValue();
30813 // Check for profitability
30814 // pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
30817 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
  return SDValue();

ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
  return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
30827 SDValue N1 = N->getOperand(1);
30828 EVT VT = N->getOperand(0).getValueType();
30829 unsigned RegSize = 128;
30830 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT =
    EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30833 // Shrink the operands of mul.
30834 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30835 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30837 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30838 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30839 // lower part is needed.
30840 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30841 if (Mode == MULU8 || Mode == MULS8) {
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                   DL, VT, MulLo);
}
30845 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30846 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30847 // the higher part is also needed.
30848 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30849 ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
30853 // Generate shuffle functioning as punpcklwd.
30854 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30855 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30856 ShuffleMask[2 * i] = i;
30857 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
SDValue ResLo =
    DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30861 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30862 // Generate shuffle functioning as punpckhwd.
30863 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30864 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30865 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
SDValue ResHi =
    DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30869 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30870 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30873 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30874 // to legalize the mul explicitly because implicit legalization for type
30875 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30876 // instructions which will not exist when we explicitly legalize it by
30877 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30878 // <4 x i16> undef).
30880 // Legalize the operands of mul.
30881 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30882 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30883 if ((RegSize % ReducedSizeInBits) != 0)
30886 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30887 DAG.getUNDEF(ReducedVT));
Ops[0] = NewN0;
NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
Ops[0] = NewN1;
NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30893 if (Mode == MULU8 || Mode == MULS8) {
// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed.
30896 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
// Convert the type of mul result to VT.
30899 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30900 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
: ISD::SIGN_EXTEND_VECTOR_INREG,
DL, ResVT, Mul);
30903 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30904 DAG.getIntPtrConstant(0, DL));
30906 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
30907 // MULU16/MULS16, both parts are needed.
30908 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30909 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30910 OpsVT, NewN0, NewN1);
30912 // Repack the lower part and higher part result of mul into a wider
30913 // result. Make sure the type of mul result is VT.
30914 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30915 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30916 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30918 DAG.getIntPtrConstant(0, DL));
30923 /// Optimize a single multiply with constant into two operations in order to
30924 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
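/// For example (illustrative): x * 45 = (x * 9) * 5 can be lowered as two LEA
/// instructions, e.g. lea t, [x + 8*x] followed by lea r, [t + 4*t]; the
/// register names here are hypothetical.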
30925 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
30926 TargetLowering::DAGCombinerInfo &DCI,
30927 const X86Subtarget &Subtarget) {
30928 EVT VT = N->getValueType(0);
30929 if (DCI.isBeforeLegalize() && VT.isVector())
30930 return reduceVMULWidth(N, DAG, Subtarget);
30932 // An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
  return SDValue();

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  return SDValue();

if (VT != MVT::i64 && VT != MVT::i32)
  return SDValue();
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
  return SDValue();
30945 uint64_t MulAmt = C->getZExtValue();
if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
  return SDValue();
30949 uint64_t MulAmt1 = 0;
30950 uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
  MulAmt1 = 9;
  MulAmt2 = MulAmt / 9;
} else if ((MulAmt % 5) == 0) {
  MulAmt1 = 5;
  MulAmt2 = MulAmt / 5;
} else if ((MulAmt % 3) == 0) {
  MulAmt1 = 3;
  MulAmt2 = MulAmt / 3;
}
SDLoc DL(N);
SDValue NewMul;
if (MulAmt1 && MulAmt2 &&
    (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
30967 if (isPowerOf2_64(MulAmt2) &&
30968 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
// If the second multiplier is pow2, issue it first. We want the multiply
// by 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add.
30972 std::swap(MulAmt1, MulAmt2);
30974 if (isPowerOf2_64(MulAmt1))
30975 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30976 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30978 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30979 DAG.getConstant(MulAmt1, DL, VT));
30981 if (isPowerOf2_64(MulAmt2))
30982 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30983 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
30985 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
30986 DAG.getConstant(MulAmt2, DL, VT));
30990 assert(MulAmt != 0 &&
30991 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
30992 "Both cases that could cause potential overflows should have "
30993 "already been handled.");
30994 int64_t SignMulAmt = C->getSExtValue();
30995 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
30996 (SignMulAmt != -INT64_MAX)) {
30997 int NumSign = SignMulAmt > 0 ? 1 : -1;
30998 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
30999 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31000 if (IsPowerOf2_64PlusOne) {
31001 // (mul x, 2^N + 1) => (add (shl x, N), x)
31002 NewMul = DAG.getNode(
31003 ISD::ADD, DL, VT, N->getOperand(0),
31004 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
                MVT::i8)));
31007 } else if (IsPowerOf2_64MinusOne) {
31008 // (mul x, 2^N - 1) => (sub (shl x, N), x)
NewMul = DAG.getNode(
    ISD::SUB, DL, VT,
31011 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
                MVT::i8)),
N->getOperand(0));
31016 // To negate, subtract the number from zero
31017 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
NewMul =
    DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
if (NewMul)
  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, NewMul, false);

return SDValue();
31030 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31031 SDValue N0 = N->getOperand(0);
31032 SDValue N1 = N->getOperand(1);
31033 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31034 EVT VT = N0.getValueType();
31036 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
// since the result of setcc_c is all zeros or all ones.
31038 if (VT.isInteger() && !VT.isVector() &&
31039 N1C && N0.getOpcode() == ISD::AND &&
31040 N0.getOperand(1).getOpcode() == ISD::Constant) {
31041 SDValue N00 = N0.getOperand(0);
31042 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31043 Mask <<= N1C->getAPIntValue();
31044 bool MaskOK = false;
31045 // We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
31048 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31049 // of the underlying setcc_c operation if the setcc_c was zero extended.
31050 // Consider the following example:
31051 // zext(setcc_c) -> i32 0x0000FFFF
31052 // c1 -> i32 0x0000FFFF
31053 // c2 -> i32 0x00000001
31054 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31055 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
  MaskOK = true;
31058 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
MaskOK = true;
31061 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31062 N00.getOpcode() == ISD::ANY_EXTEND) &&
31063 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31064 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31066 if (MaskOK && Mask != 0) {
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31072 // Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// SHL.
31075 // (shl V, 1) -> add V,V
31076 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31077 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31078 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31079 // We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
31082 if (N1SplatC->getAPIntValue() == 1)
31083 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31089 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31090 SDValue N0 = N->getOperand(0);
31091 SDValue N1 = N->getOperand(1);
31092 EVT VT = N0.getValueType();
31093 unsigned Size = VT.getSizeInBits();
31095 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31096 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31097 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31098 // depending on sign of (SarConst - [56,48,32,24,16])
// sexts on X86 are MOVs. The MOVs have the same code size
// as the above SHIFTs (only a shift by 1 has a smaller encoding).
// However, the MOVs have two advantages over a SHIFT:
31103 // 1. MOVs can write to a register that differs from source
31104 // 2. MOVs accept memory operands
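// For example (illustrative): for i64 a, (ashr (shl a, 56), 58) sign-extends
// the low byte of a and then shifts two more bits, so it can be rewritten as
// (sra (sext_inreg a, i8), 2).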
31106 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31107 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
N0.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
31111 SDValue N00 = N0.getOperand(0);
31112 SDValue N01 = N0.getOperand(1);
31113 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31114 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31115 EVT CVT = N1.getValueType();
if (SarConst.isNegative())
  return SDValue();
31120 for (MVT SVT : MVT::integer_valuetypes()) {
31121 unsigned ShiftSize = SVT.getSizeInBits();
// Skip types without a corresponding sext/zext and
// ShlConst values that are not one of [56,48,32,24,16].
if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
  continue;
SDValue NN =
    DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31129 SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
  return NN;
else if (SarConst.isNegative())
31133 return DAG.getNode(ISD::SHL, DL, VT, NN,
31134 DAG.getConstant(-SarConst, DL, CVT));
31136 return DAG.getNode(ISD::SRA, DL, VT, NN,
31137 DAG.getConstant(SarConst, DL, CVT));
31142 /// \brief Returns a vector of 0s if the node in input is a vector logical
31143 /// shift by a constant amount which is known to be bigger than or equal
31144 /// to the vector element size in bits.
31145 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31146 const X86Subtarget &Subtarget) {
31147 EVT VT = N->getValueType(0);
31149 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31150 (!Subtarget.hasInt256() ||
(VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
return SDValue();
SDValue Amt = N->getOperand(1);
SDLoc DL(N);
31156 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31157 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31158 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31159 unsigned MaxAmount =
31160 VT.getSimpleVT().getScalarSizeInBits();
31162 // SSE2/AVX2 logical shifts always return a vector of 0s
31163 // if the shift amount is bigger than or equal to
31164 // the element size. The constant shift amount will be
// encoded as an 8-bit immediate.
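// For example (illustrative): (srl v4i32 X, splat 32) always produces zeroes
// on these targets, so it can be folded to a zero vector directly.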
31166 if (ShiftAmt.trunc(8).uge(MaxAmount))
31167 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31173 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31174 TargetLowering::DAGCombinerInfo &DCI,
31175 const X86Subtarget &Subtarget) {
31176 if (N->getOpcode() == ISD::SHL)
if (SDValue V = combineShiftLeft(N, DAG))
  return V;
31180 if (N->getOpcode() == ISD::SRA)
if (SDValue V = combineShiftRightAlgebraic(N, DAG))
  return V;
31184 // Try to fold this logical shift into a zero vector.
31185 if (N->getOpcode() != ISD::SRA)
if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
  return V;
31192 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31193 TargetLowering::DAGCombinerInfo &DCI,
31194 const X86Subtarget &Subtarget) {
31195 unsigned Opcode = N->getOpcode();
31196 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31197 X86ISD::VSRLI == Opcode) &&
31198 "Unexpected shift opcode");
31199 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31200 EVT VT = N->getValueType(0);
31201 SDValue N0 = N->getOperand(0);
31202 SDValue N1 = N->getOperand(1);
31203 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31204 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31205 "Unexpected value type");
31207 // Out of range logical bit shifts are guaranteed to be zero.
31208 // Out of range arithmetic bit shifts splat the sign bit.
31209 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31210 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
if (LogicalShift)
  return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
ShiftVal = NumBitsPerElt - 1;
// Shift N0 by zero -> N0.
if (ShiftVal == 0)
  return N0;
31221 // Shift zero -> zero.
31222 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31223 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31225 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31226 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31227 // TODO - support other sra opcodes as needed.
31228 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31229 N0.getOpcode() == X86ISD::VSRAI)
31230 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31232 // We can decode 'whole byte' logical bit shifts as shuffles.
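// For example (illustrative): (VSRLI v2i64 X, 16) moves each 64-bit element
// right by two whole bytes, which a byte shuffle that shifts in zeroes can
// express.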
31233 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31236 NonceMask.push_back(0);
31237 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
31240 return SDValue(); // This routine will use CombineTo to replace N.
31243 // Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
31246 if (N->isOnlyUserOf(N0.getNode()) &&
31247 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31248 assert(EltBits.size() == VT.getVectorNumElements() &&
31249 "Unexpected shift value type");
31250 unsigned ShiftImm = ShiftVal.getZExtValue();
31251 for (APInt &Elt : EltBits) {
if (X86ISD::VSHLI == Opcode)
  Elt <<= ShiftImm;
31254 else if (X86ISD::VSRAI == Opcode)
31255 Elt.ashrInPlace(ShiftImm);
else
  Elt.lshrInPlace(ShiftImm);
31259 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31265 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31266 TargetLowering::DAGCombinerInfo &DCI,
31267 const X86Subtarget &Subtarget) {
31269 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31270 (N->getOpcode() == X86ISD::PINSRW &&
31271 N->getValueType(0) == MVT::v8i16)) &&
31272 "Unexpected vector insertion");
31274 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31277 NonceMask.push_back(0);
31278 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget);
return SDValue();
31284 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31285 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31286 /// OR -> CMPNEQSS.
31287 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31288 TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
31292 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31293 // we're requiring SSE2 for both.
31294 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31295 SDValue N0 = N->getOperand(0);
31296 SDValue N1 = N->getOperand(1);
31297 SDValue CMP0 = N0->getOperand(1);
SDValue CMP1 = N1->getOperand(1);
SDLoc DL(N);
31301 // The SETCCs should both refer to the same CMP.
31302 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31305 SDValue CMP00 = CMP0->getOperand(0);
31306 SDValue CMP01 = CMP0->getOperand(1);
31307 EVT VT = CMP00.getValueType();
31309 if (VT == MVT::f32 || VT == MVT::f64) {
31310 bool ExpectingFlags = false;
31311 // Check for any users that want flags:
31312 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31313 !ExpectingFlags && UI != UE; ++UI)
31314 switch (UI->getOpcode()) {
31319 ExpectingFlags = true;
31321 case ISD::CopyToReg:
31322 case ISD::SIGN_EXTEND:
31323 case ISD::ZERO_EXTEND:
31324 case ISD::ANY_EXTEND:
31328 if (!ExpectingFlags) {
31329 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31330 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31332 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
X86::CondCode tmp = cc0;
cc0 = cc1;
cc1 = tmp;
31338 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31339 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31340 // FIXME: need symbolic constants for these magic numbers.
31341 // See X86ATTInstPrinter.cpp:printSSECC().
31342 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31343 if (Subtarget.hasAVX512()) {
SDValue FSetCC =
    DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31346 DAG.getConstant(x86cc, DL, MVT::i8));
31347 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31348 FSetCC, DAG.getIntPtrConstant(0, DL));
31350 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31351 CMP00.getValueType(), CMP00, CMP01,
DAG.getConstant(x86cc, DL,
                MVT::i8));
31355 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31356 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31358 if (is64BitFP && !Subtarget.is64Bit()) {
31359 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31360 // 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
31362 // bits, but can do this little dance to extract the lowest 32 bits
31363 // and work with those going forward.
SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                               OnesOrZeroesF);
31366 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31367 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31368 Vector32, DAG.getIntPtrConstant(0, DL));
31372 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31373 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31374 DAG.getConstant(1, DL, IntVT));
SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    ANDed);
31377 return OneBitOfTruth;
31385 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31386 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31387 assert(N->getOpcode() == ISD::AND);
31389 EVT VT = N->getValueType(0);
31390 SDValue N0 = N->getOperand(0);
31391 SDValue N1 = N->getOperand(1);
31394 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31397 if (N0.getOpcode() == ISD::XOR &&
31398 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31399 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31401 if (N1.getOpcode() == ISD::XOR &&
31402 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31403 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31408 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31409 // register. In most cases we actually compare or select YMM-sized registers
31410 // and mixing the two types creates horrible code. This method optimizes
31411 // some of the transition sequences.
31412 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31413 TargetLowering::DAGCombinerInfo &DCI,
31414 const X86Subtarget &Subtarget) {
31415 EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
  return SDValue();
31419 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31420 N->getOpcode() == ISD::ZERO_EXTEND ||
31421 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31423 SDValue Narrow = N->getOperand(0);
31424 EVT NarrowVT = Narrow->getValueType(0);
if (!NarrowVT.is128BitVector())
  return SDValue();
31428 if (Narrow->getOpcode() != ISD::XOR &&
31429 Narrow->getOpcode() != ISD::AND &&
Narrow->getOpcode() != ISD::OR)
return SDValue();
31433 SDValue N0 = Narrow->getOperand(0);
SDValue N1 = Narrow->getOperand(1);
SDLoc DL(Narrow);
31437 // The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
  return SDValue();
31441 // The type of the truncated inputs.
31442 EVT WideVT = N0->getOperand(0)->getValueType(0);
31446 // The right side has to be a 'trunc' or a constant vector.
31447 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31448 ConstantSDNode *RHSConstSplat = nullptr;
31449 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31450 RHSConstSplat = RHSBV->getConstantSplatNode();
if (!RHSTrunc && !RHSConstSplat)
  return SDValue();
31454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
  return SDValue();
31459 // Set N0 and N1 to hold the inputs to the new wide operation.
31460 N0 = N0->getOperand(0);
31461 if (RHSConstSplat) {
31462 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31463 SDValue(RHSConstSplat, 0));
31464 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31465 } else if (RHSTrunc) {
31466 N1 = N1->getOperand(0);
31469 // Generate the wide operation.
31470 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
case ISD::ANY_EXTEND:
  return Op;
31475 case ISD::ZERO_EXTEND: {
31476 unsigned InBits = NarrowVT.getScalarSizeInBits();
31477 APInt Mask = APInt::getAllOnesValue(InBits);
31478 Mask = Mask.zext(VT.getScalarSizeInBits());
31479 return DAG.getNode(ISD::AND, DL, VT,
31480 Op, DAG.getConstant(Mask, DL, VT));
31482 case ISD::SIGN_EXTEND:
31483 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31484 Op, DAG.getValueType(NarrowVT));
default:
  llvm_unreachable("Unexpected opcode");
31490 /// If both input operands of a logic op are being cast from floating point
31491 /// types, try to convert this into a floating point logic node to avoid
31492 /// unnecessary moves from SSE to integer registers.
31493 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31494 const X86Subtarget &Subtarget) {
31495 unsigned FPOpcode = ISD::DELETED_NODE;
31496 if (N->getOpcode() == ISD::AND)
31497 FPOpcode = X86ISD::FAND;
31498 else if (N->getOpcode() == ISD::OR)
31499 FPOpcode = X86ISD::FOR;
31500 else if (N->getOpcode() == ISD::XOR)
31501 FPOpcode = X86ISD::FXOR;
31503 assert(FPOpcode != ISD::DELETED_NODE &&
31504 "Unexpected input node for FP logic conversion");
31506 EVT VT = N->getValueType(0);
31507 SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
31510 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31511 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31512 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31513 SDValue N00 = N0.getOperand(0);
31514 SDValue N10 = N1.getOperand(0);
31515 EVT N00Type = N00.getValueType();
31516 EVT N10Type = N10.getValueType();
31517 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31518 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31519 return DAG.getBitcast(VT, FPLogic);
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31527 /// with a shift-right to eliminate loading the vector constant mask value.
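/// For example (illustrative): if every 16-bit element of Op0 is known to be
/// all-ones or zero, then (and Op0, splat 1) == (srl Op0, 15), avoiding a
/// constant pool load for the mask.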
31528 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31529 const X86Subtarget &Subtarget) {
31530 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31531 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31532 EVT VT0 = Op0.getValueType();
31533 EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
  return SDValue();
APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
    !SplatVal.isMask())
  return SDValue();
if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
  return SDValue();
31546 unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
  return SDValue();
SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
31552 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31553 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31554 return DAG.getBitcast(N->getValueType(0), Shift);
31557 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31558 TargetLowering::DAGCombinerInfo &DCI,
31559 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
  return SDValue();

if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
  return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
  return FPLogic;

if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
  return R;

if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
  return ShiftRight;
31575 EVT VT = N->getValueType(0);
31576 SDValue N0 = N->getOperand(0);
31577 SDValue N1 = N->getOperand(1);
31580 // Attempt to recursively combine a bitmask AND with shuffles.
31581 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31584 NonceMask.push_back(0);
31585 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
31588 return SDValue(); // This routine will use CombineTo to replace N.
31591 // Create BEXTR instructions
31592 // BEXTR is ((X >> imm) & (2**size-1))
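// For example (illustrative): (and (srl x, 4), 0xFFF) extracts a 12-bit field
// starting at bit 4; the control operand below packs this as
// (MaskSize << 8) | Shift = (12 << 8) | 4.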
if (VT != MVT::i32 && VT != MVT::i64)
  return SDValue();

if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
  return SDValue();

if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
  return SDValue();
31601 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31602 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31603 if (MaskNode && ShiftNode) {
31604 uint64_t Mask = MaskNode->getZExtValue();
31605 uint64_t Shift = ShiftNode->getZExtValue();
31606 if (isMask_64(Mask)) {
31607 uint64_t MaskSize = countPopulation(Mask);
31608 if (Shift + MaskSize <= VT.getSizeInBits())
31609 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
31610 DAG.getConstant(Shift | (MaskSize << 8), DL,
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
31621 // As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
31625 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31626 const X86Subtarget &Subtarget) {
31627 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
31629 SDValue N0 = N->getOperand(0);
31630 SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
SDLoc DL(N);
31633 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
(VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
31637 // Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
  std::swap(N0, N1);
31641 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
31642 // ANDNP combine allows other combines to happen that prevent matching.
31643 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
31646 SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
SDValue Y;
31649 if (N0.getOperand(0) == Mask)
31650 Y = N0.getOperand(1);
31651 if (N0.getOperand(1) == Mask)
31652 Y = N0.getOperand(0);
// Check to see if the mask appeared in both the AND and ANDNP.
if (!Y.getNode())
  return SDValue();
31658 // Validate that X, Y, and Mask are bitcasts, and see through them.
31659 Mask = peekThroughBitcasts(Mask);
31660 X = peekThroughBitcasts(X);
31661 Y = peekThroughBitcasts(Y);
31663 EVT MaskVT = Mask.getValueType();
31664 unsigned EltBits = MaskVT.getScalarSizeInBits();
31666 // TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
  return SDValue();
31673 // (or (and (M, (sub 0, X)), (pandn M, X)))
31674 // which is a special case of vselect:
31675 // (vselect M, (sub 0, X), X)
31677 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31678 // We know that, if fNegate is 0 or 1:
31679 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31681 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31682 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31683 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31684 // This lets us transform our vselect to:
31685 // (add (xor X, M), (and M, 1))
31687 // (sub (xor X, M), M)
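// As a numeric sanity check (illustrative): with X = 5 and M = all-ones,
// (xor 5, -1) - (-1) = -6 + 1 = -5, and with M = 0, (xor 5, 0) - 0 = 5.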
31688 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
31689 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
31690 auto IsNegV = [](SDNode *N, SDValue V) {
31691 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
};
SDValue V;
if (IsNegV(Y.getNode(), X))
  V = X;
else if (IsNegV(X.getNode(), Y))
  V = Y;
else
  return SDValue();
31701 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31702 SDValue SubOp2 = Mask;
31704 // If the negate was on the false side of the select, then
31705 // the operands of the SUB need to be swapped. PR 27251.
31706 // This is because the pattern being matched above is
31707 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31708 // but if the pattern matched was
31709 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31710 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31711 // pattern also needs to be a negation of the replacement pattern above.
31712 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31713 // sub accomplishes the negation of the replacement pattern.
if (V == Y)
  std::swap(SubOp1, SubOp2);
31717 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
31718 return DAG.getBitcast(VT, Res);
31722 // PBLENDVB is only available on SSE 4.1.
if (!Subtarget.hasSSE41())
  return SDValue();
31726 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31728 X = DAG.getBitcast(BlendVT, X);
31729 Y = DAG.getBitcast(BlendVT, Y);
31730 Mask = DAG.getBitcast(BlendVT, Mask);
31731 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
31732 return DAG.getBitcast(VT, Mask);
31735 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms seteq(cmp x, 0) into:
//   srl(ctlz x), log2(bitsize(x))
31740 // Input pattern is checked by caller.
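// For example (illustrative), with 32-bit x: ctlz(0) = 32 and 32 >> 5 = 1,
// while for any x != 0, ctlz(x) <= 31 and 31 >> 5 = 0, so the shifted ctlz
// computes x == 0 without a setcc.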
31741 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31742 SelectionDAG &DAG) {
31743 SDValue Cmp = Op.getOperand(1);
31744 EVT VT = Cmp.getOperand(0).getValueType();
unsigned Log2b = Log2_32(VT.getSizeInBits());
SDLoc dl(Op);
31747 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31748 // The result of the shift is true or false, and on X86, the 32-bit
31749 // encoding of shr and lzcnt is more desirable.
31750 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31751 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31752 DAG.getConstant(Log2b, dl, VT));
31753 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31756 // Try to transform:
31757 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
31760 // Will also attempt to match more generic cases, eg:
31761 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31762 // Only applies if the target supports the FastLZCNT feature.
31763 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31764 TargetLowering::DAGCombinerInfo &DCI,
31765 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
  return SDValue();
31769 auto isORCandidate = [](SDValue N) {
31770 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31773 // Check the zero extend is extending to 32-bit or more. The code generated by
31774 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31775 // instructions to clear the upper bits.
31776 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
!isORCandidate(N->getOperand(0)))
return SDValue();
31780 // Check the node matches: setcc(eq, cmp 0)
31781 auto isSetCCCandidate = [](SDValue N) {
31782 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31783 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31784 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31785 isNullConstant(N->getOperand(1).getOperand(1)) &&
31786 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31789 SDNode *OR = N->getOperand(0).getNode();
31790 SDValue LHS = OR->getOperand(0);
31791 SDValue RHS = OR->getOperand(1);
31793 // Save nodes matching or(or, setcc(eq, cmp 0)).
31794 SmallVector<SDNode *, 2> ORNodes;
31795 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31796 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31797 ORNodes.push_back(OR);
31798 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31799 LHS = OR->getOperand(0);
31800 RHS = OR->getOperand(1);
31803 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31804 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
!isORCandidate(SDValue(OR, 0)))
return SDValue();
31808 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31810 // or(srl(ctlz),srl(ctlz)).
31811 // The dag combiner can then fold it into:
31812 // srl(or(ctlz, ctlz)).
31813 EVT VT = OR->getValueType(0);
31814 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31815 SDValue Ret, NewRHS;
31816 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
31817 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
31822 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31823 while (ORNodes.size() > 0) {
31824 OR = ORNodes.pop_back_val();
31825 LHS = OR->getOperand(0);
31826 RHS = OR->getOperand(1);
31827 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31828 if (RHS->getOpcode() == ISD::OR)
31829 std::swap(LHS, RHS);
31830 EVT VT = OR->getValueType(0);
SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
if (!NewRHS)
  return SDValue();
31834 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
return Ret;
31843 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31844 TargetLowering::DAGCombinerInfo &DCI,
31845 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
  return SDValue();

if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
  return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
  return FPLogic;

if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
  return R;
31858 SDValue N0 = N->getOperand(0);
31859 SDValue N1 = N->getOperand(1);
31860 EVT VT = N->getValueType(0);
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
  return SDValue();
31865 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31866 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31868 // SHLD/SHRD instructions have lower register pressure, but on some
31869 // platforms they have higher latency than the equivalent
31870 // series of shifts/or that would otherwise be generated.
31871 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31872 // have higher latencies and we are not optimizing for size.
31873 if (!OptForSize && Subtarget.isSHLDSlow())
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
  std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
  return SDValue();
if (!N0.hasOneUse() || !N1.hasOneUse())
  return SDValue();
31883 SDValue ShAmt0 = N0.getOperand(1);
if (ShAmt0.getValueType() != MVT::i8)
  return SDValue();
31886 SDValue ShAmt1 = N1.getOperand(1);
if (ShAmt1.getValueType() != MVT::i8)
  return SDValue();
31889 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31890 ShAmt0 = ShAmt0.getOperand(0);
31891 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31892 ShAmt1 = ShAmt1.getOperand(0);
31895 unsigned Opc = X86ISD::SHLD;
31896 SDValue Op0 = N0.getOperand(0);
31897 SDValue Op1 = N1.getOperand(0);
31898 if (ShAmt0.getOpcode() == ISD::SUB ||
31899 ShAmt0.getOpcode() == ISD::XOR) {
31900 Opc = X86ISD::SHRD;
31901 std::swap(Op0, Op1);
31902 std::swap(ShAmt0, ShAmt1);
31905 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31906 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31907 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31908 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31909 unsigned Bits = VT.getSizeInBits();
31910 if (ShAmt1.getOpcode() == ISD::SUB) {
31911 SDValue Sum = ShAmt1.getOperand(0);
31912 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31913 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31914 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31915 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
31916 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
return DAG.getNode(Opc, DL, VT,
                   Op0, Op1,
                   DAG.getNode(ISD::TRUNCATE, DL,
                               MVT::i8, ShAmt0));
31922 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
31923 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
31924 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
31925 return DAG.getNode(Opc, DL, VT,
31926 N0.getOperand(0), N1.getOperand(0),
DAG.getNode(ISD::TRUNCATE, DL,
            MVT::i8, ShAmt0));
31929 } else if (ShAmt1.getOpcode() == ISD::XOR) {
31930 SDValue Mask = ShAmt1.getOperand(1);
31931 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
31932 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
31933 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
31934 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
31935 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
31936 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
31937 if (Op1.getOpcode() == InnerShift &&
31938 isa<ConstantSDNode>(Op1.getOperand(1)) &&
31939 Op1.getConstantOperandVal(1) == 1) {
31940 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31941 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31943 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
31944 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
31945 Op1.getOperand(0) == Op1.getOperand(1)) {
31946 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31947 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31956 /// Generate NEG and CMOV for integer abs.
31957 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
31958 EVT VT = N->getValueType(0);
31960 // Since X86 does not have CMOV for 8-bit integer, we don't convert
31961 // 8-bit integer abs to NEG and CMOV.
if (VT.isInteger() && VT.getSizeInBits() == 8)
  return SDValue();
31965 SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
31969 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
31970 // and change it to SUB and CMOV.
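// As an illustrative example: for 32-bit X = -5, Y = X >> 31 = -1 (all ones),
// ADD(X, Y) = -6, and XOR(-6, -1) = 5 = |X|; for X >= 0, Y is 0 and the
// pattern leaves X unchanged.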
31971 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
31972 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
31973 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
31974 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
31975 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
31976 // Generate SUB & CMOV.
31977 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
31978 DAG.getConstant(0, DL, VT), N0.getOperand(0));
31979 SDValue Ops[] = {N0.getOperand(0), Neg,
31980 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
31981 SDValue(Neg.getNode(), 1)};
31982 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
31988 /// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
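/// For example (illustrative), for i32 X:
///   xor (trunc (srl X, 31)), 1
/// computes "X >= 0", which is exactly SETGT(X, -1).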
31992 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
31993 // This is only worth doing if the output type is i8 or i1.
31994 EVT ResultType = N->getValueType(0);
if (ResultType != MVT::i8 && ResultType != MVT::i1)
  return SDValue();
31998 SDValue N0 = N->getOperand(0);
31999 SDValue N1 = N->getOperand(1);
32001 // We should be performing an xor against a truncated shift.
if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
  return SDValue();
32005 // Make sure we are performing an xor against one.
if (!isOneConstant(N1))
  return SDValue();
32009 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32010 SDValue Shift = N0.getOperand(0);
if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
  return SDValue();
32014 // Make sure we are truncating from one of i16, i32 or i64.
32015 EVT ShiftTy = Shift.getValueType();
if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
  return SDValue();
32019 // Make sure the shift amount extracts the sign bit.
32020 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
return SDValue();
32024 // Create a greater-than comparison against -1.
32025 // N.B. Using SETGE against 0 works but we want a canonical looking
// comparison; using SETGT matches up with what TranslateX86CC produces.
SDLoc DL(N);
SDValue ShiftOp = Shift.getOperand(0);
32029 EVT ShiftOpTy = ShiftOp.getValueType();
32030 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32031 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32032 *DAG.getContext(), ResultType);
32033 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32034 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32035 if (SetCCResultType != ResultType)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
return Cond;
32040 /// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
32045 /// This should be called before type legalization because the pattern may not
32046 /// persist after that.
32047 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32048 const X86Subtarget &Subtarget) {
32049 EVT VT = N->getValueType(0);
if (!VT.isSimple())
  return SDValue();
32053 switch (VT.getSimpleVT().SimpleTy) {
32054 default: return SDValue();
32057 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32058 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32062 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32065 // There must be a shift right algebraic before the xor, and the xor must be a
32066 // 'not' operation.
32067 SDValue Shift = N->getOperand(0);
32068 SDValue Ones = N->getOperand(1);
32069 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
32073 // The shift should be smearing the sign bit across each vector element.
auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
if (!ShiftBV)
  return SDValue();
32078 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32079 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
  return SDValue();
32083 // Create a greater-than comparison against -1. We don't use the more obvious
32084 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32085 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
32089 /// is valid for the given \p Subtarget.
32090 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32091 const X86Subtarget &Subtarget) {
if (!Subtarget.hasAVX512())
  return false;
32095 // FIXME: Scalar type may be supported if we move it to vector register.
if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
  return false;
32099 EVT SrcElVT = SrcVT.getScalarType();
32100 EVT DstElVT = DstVT.getScalarType();
if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
  return false;
if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
  return false;
32105 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32106 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32110 /// Detect a pattern of truncation with saturation:
32111 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
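/// For example (illustrative): (truncate (umin X, 65535) to <8 x i16>) returns
/// X, since 65535 is the unsigned maximum of i16.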
32114 static SDValue detectUSatPattern(SDValue In, EVT VT) {
if (In.getOpcode() != ISD::UMIN)
  return SDValue();
// Saturation with truncation. We truncate from InVT to VT.
32119 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32120 "Unexpected types for truncate operation");
APInt C;
if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32124 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
// to the element size of the destination type.
return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
       SDValue();
32132 /// Detect a pattern of truncation with saturation:
32133 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32134 /// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
32137 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32138 const X86Subtarget &Subtarget) {
if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
  return SDValue();
32141 return detectUSatPattern(In, VT);
static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32146 const X86Subtarget &Subtarget) {
32147 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
  return SDValue();
32150 if (auto USatVal = detectUSatPattern(In, VT))
32151 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32152 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32156 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32158 /// X86ISD::AVG instruction.
32159 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
const SDLoc &DL) {
if (!VT.isVector() || !VT.isSimple())
  return SDValue();
32164 EVT InVT = In.getValueType();
32165 unsigned NumElems = VT.getVectorNumElements();
32167 EVT ScalarVT = VT.getVectorElementType();
32168 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
isPowerOf2_32(NumElems)))
return SDValue();
32172 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32173 // than the original input type (i8/i16).
32174 EVT InScalarVT = InVT.getVectorElementType();
if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
  return SDValue();
if (!Subtarget.hasSSE2())
  return SDValue();
32180 if (Subtarget.hasBWI()) {
if (VT.getSizeInBits() > 512)
  return SDValue();
32183 } else if (Subtarget.hasAVX2()) {
if (VT.getSizeInBits() > 256)
  return SDValue();
if (VT.getSizeInBits() > 128)
  return SDValue();
32191 // Detect the following pattern:
32193 // %1 = zext <N x i8> %a to <N x i32>
32194 // %2 = zext <N x i8> %b to <N x i32>
32195 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32196 // %4 = add nuw nsw <N x i32> %3, %2
// %5 = lshr <N x i32> %4, <i32 1 x N>
32198 // %6 = trunc <N x i32> %5 to <N x i8>
32200 // In AVX512, the last instruction can also be a trunc store.
if (In.getOpcode() != ISD::SRL)
  return SDValue();
32205 // A lambda checking the given SDValue is a constant vector and each element
32206 // is in the range [Min, Max].
32207 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32208 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
  return false;
32211 for (SDValue Op : V->ops()) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
  return false;
32215 uint64_t Val = C->getZExtValue();
if (Val < Min || Val > Max)
  return false;
// Check if each element of the vector is right-shifted by one.
32223 auto LHS = In.getOperand(0);
32224 auto RHS = In.getOperand(1);
if (!IsConstVectorInRange(RHS, 1, 1))
  return SDValue();
if (LHS.getOpcode() != ISD::ADD)
  return SDValue();
32230 // Detect a pattern of a + b + 1 where the order doesn't matter.
32231 SDValue Operands[3];
32232 Operands[0] = LHS.getOperand(0);
32233 Operands[1] = LHS.getOperand(1);
32235 // Take care of the case when one of the operands is a constant vector whose
32236 // element is in the range [1, 256].
32237 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32238 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32239 Operands[0].getOperand(0).getValueType() == VT) {
32240 // The pattern is detected. Subtract one from the constant vector, then
32241 // demote it and emit X86ISD::AVG instruction.
32242 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32243 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32244 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                   Operands[1]);
32249 if (Operands[0].getOpcode() == ISD::ADD)
32250 std::swap(Operands[0], Operands[1]);
else if (Operands[1].getOpcode() != ISD::ADD)
  return SDValue();
32253 Operands[2] = Operands[1].getOperand(0);
32254 Operands[1] = Operands[1].getOperand(1);
32256 // Now we have three operands of two additions. Check that one of them is a
32257 // constant vector with ones, and the other two are promoted from i8/i16.
32258 for (int i = 0; i < 3; ++i) {
if (!IsConstVectorInRange(Operands[i], 1, 1))
  continue;
32261 std::swap(Operands[i], Operands[2]);
32263 // Check if Operands[0] and Operands[1] are results of type promotion.
32264 for (int j = 0; j < 2; ++j)
32265 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
Operands[j].getOperand(0).getValueType() != VT)
return SDValue();
32269 // The pattern is detected, emit X86ISD::AVG instruction.
32270 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32271 Operands[1].getOperand(0));
32277 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32278 TargetLowering::DAGCombinerInfo &DCI,
32279 const X86Subtarget &Subtarget) {
32280 LoadSDNode *Ld = cast<LoadSDNode>(N);
32281 EVT RegVT = Ld->getValueType(0);
EVT MemVT = Ld->getMemoryVT();
SDLoc dl(Ld);
32284 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32286 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32287 // into two 16-byte operations.
32288 ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned AddressSpace = Ld->getAddressSpace();
32291 unsigned Alignment = Ld->getAlignment();
32292 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32293 Ext == ISD::NON_EXTLOAD &&
32294 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32295 AddressSpace, Alignment, &Fast) && !Fast) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
  return SDValue();
32300 SDValue Ptr = Ld->getBasePtr();
32302 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32305 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32306 Alignment, Ld->getMemOperand()->getFlags());
32308 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32310 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32311 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32312 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32314 Load2.getValue(1));
32316 SDValue NewVec = DAG.getUNDEF(RegVT);
32317 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32318 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32319 return DCI.CombineTo(N, NewVec, TF, true);
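// E.g. on chips where unaligned 32-byte accesses are slow (such as Sandy
// Bridge), a 256-bit vector load becomes two half-width loads whose results
// are inserted into the low and high 128-bit halves of an undef vector; the
// two load chains are stitched together with a TokenFactor.
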
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}

/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}

/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}

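// E.g. a v4f32 masked load whose mask folds to <0,0,-1,0> becomes a scalar
// load of element 2 from BasePtr+8 plus an INSERT_VECTOR_ELT of that value
// into the pass-through operand at index 2.
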
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).
  //
  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}

static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}

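// Roughly, a sext-loading v8i16->v8i32 masked load becomes a non-extending
// v16i16 masked load (pass-through shuffled into the even lanes, mask widened
// with zeros) followed by an X86ISD::VSEXT back to v8i32.
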
/// If exactly one element of the mask is set for a non-truncating masked store,
/// it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}

static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore())
    return reduceMaskedStoreToScalarStore(Mst, DAG);

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}

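// E.g. a v8i32->v8i16 truncating masked store shuffles the low i16 halves of
// the value into the first eight lanes of a v16i16 and widens a v8i1 mask to
// v16i1 by concatenating zeros, so no extra lanes are written to memory.
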
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }
  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From/To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz))
      return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz)
      return SDValue();

    unsigned SizeRatio = FromSz / ToSz;
    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT =
        EVT::getVectorVT(*DAG.getContext(), StoreType,
                         VT.getSizeInBits() / StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits();
         i != e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }
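  // E.g. when a dedicated truncating store (such as AVX512's vpmovdw) is not
  // available, a v8i32->v8i16 truncating store packs the low halves into the
  // bottom of the register via a v16i16 shuffle and then emits the 128 bits
  // of data as a small number of GPR- or f64-sized chunk stores.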
  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.
  //
  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode *LdVal = St->getValue().getNode();
    LoadSDNode *Ld = nullptr;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode *ChainVal = St->getChain().getNode();
    // Must be a store of a load. We currently handle two cases: the load
    // is a direct child, and it's under an intervening TokenFactor. It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->getAlignment(),
                                  Ld->getMemOperand()->getFlags());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex >= 0) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex >= 0) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(
        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}

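// E.g. with SSE3, for v4f32:
//   fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
// becomes X86ISD::FHADD A, B, i.e. a single HADDPS instruction.
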
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
    // better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}

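// E.g. on AVX2, trunc (v4i64 mul (x, y)) to v4i32 becomes
// mul (trunc x), (trunc y): the v4i32 multiply is legal while the v4i64
// multiply is not (until AVX512DQ), so pre-truncating avoids the wide multiply.
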
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

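// Sketch of the i32->i8 case above: each register is first masked to 0..255,
// so every PACKUS round is a lossless interleave-and-compress. After round
// one the v16i8 lanes hold <a0,0,a1,0,...>; after round two the bytes are
// fully packed, and any remaining registers are concatenated.
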
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}

/// This function transforms vector truncation of 'all or none' bits values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Use PACKSS if the input is a splatted sign bit.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits != InSVT.getSizeInBits())
    return SDValue();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}

static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign bits with PACKSS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;
  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;
  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }
  return SDValue();
}

/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case X86ISD::FMADD:      NewOpcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB:     NewOpcode = X86ISD::FMADD;      break;
    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}

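// E.g. (fneg (fmadd a, b, c)) becomes (fnmsub a, b, c), since
// -(a*b + c) == -(a*b) - c; the rounded (_RND) forms flip the same way.
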
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR: IntOpcode = ISD::OR; break;
    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
    case X86ISD::FAND: IntOpcode = ISD::AND; break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (Subtarget.hasCMov())
    if (SDValue RV = combineIntegerAbs(N, DAG))
      return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}

static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are Commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                    Op1
  //                Num     NaN
  //             ----------------
  //        Num  |  Max  |  Op0 |
  //   Op0       ----------------
  //        NaN  |  Op1  |  NaN |
  //             ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
  // SSE and AVX2 since there is no sign-extended shift right
  // operation on a vector with 64-bit elements.
  //   (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}


/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign/zero-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
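
// Illustrative example of the promotion above: given i32 'x' and a user
// '(shl (sext (add_nsw x, 5)), 1)', hoisting the sext yields
// '(shl (add (sext x), 5), 1)', which can select to a single LEA such as
// 'leaq 10(,%rax,2), %rcx' (a sketch; the exact form depends on selection).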

/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  return R.getValue(1);
}
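
// For example (illustrative): (i32 sext (i8 srem X, Y)) becomes the second
// result of an SDIVREM8_SEXT_HREG node above, so isel can read the remainder
// straight out of AH after idivb instead of shifting it out of AX and
// sign-extending it separately.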

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// then extend the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 targets), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}
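
// Illustrative example of the pre-AVX2 split path above: a v16i8 -> v16i32
// zext is emitted as four 128-bit ZERO_EXTEND_VECTOR_INREG nodes (one per
// v4i32 chunk), each of which can lower to a single pmovzxbd, and the chunks
// are concatenated back together.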

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending and
    // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
    // efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when exactly one of NegA/NegB is set.
  bool NegMul = (NegA != NegB);

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else {
    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
           "Unexpected opcode!");
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}
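
// Example foldings performed by combineFMA above (illustrative):
//   fma(fneg(a), b, c)       --> fnmadd(a, b, c)
//   fma(a, b, fneg(c))       --> fmsub(a, b, c)
//   fma(fneg(a), b, fneg(c)) --> fnmsub(a, b, c)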

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //   (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison, but ignore a
  // comparison with zero because that gets special treatment in EmitTest().
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue VecX = DAG.getBitcast(VecVT, X);
    SDValue VecY = DAG.getBitcast(VecVT, Y);

    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}
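
// Illustrative codegen for the i128 equality case above, assuming SSE2 and
// both operands already in registers (a sketch, not exhaustive):
//   pcmpeqb  %xmm1, %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xFFFF, %eax
//   sete     %al
// One vector compare replaces a chain of two 64-bit compares plus logic.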

static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.getScalarType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT0 || !IsVZero1) {
      // Swap the operands and update the condition code.
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);

      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
    }

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }
  return SDValue(N, 0);
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);

    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
                             X86TargetLowering::DAGCombinerInfo &DCI) {
  // When legalizing carry, we create carries via add X, -1.
  // If that comes from an actual carry, via setcc, we use the
  // carry directly.
  if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
    SDValue Carry = N->getOperand(0);
    while (Carry.getOpcode() == ISD::TRUNCATE ||
           Carry.getOpcode() == ISD::ZERO_EXTEND ||
           Carry.getOpcode() == ISD::SIGN_EXTEND ||
           Carry.getOpcode() == ISD::ANY_EXTEND ||
           (Carry.getOpcode() == ISD::AND &&
            isOneConstant(Carry.getOperand(1))))
      Carry = Carry.getOperand(0);

    if (Carry.getOpcode() == ISD::SETCC ||
        Carry.getOpcode() == X86ISD::SETCC ||
        Carry.getOpcode() == X86ISD::SETCC_CARRY) {
      if (Carry.getConstantOperandVal(0) == X86::COND_B)
        return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
    }
  }

  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // it's dead.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
/// which is more useful than 0/1 in some cases.
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);
  // "Condition code B" is also known as "the carry flag" (CF).
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}
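
// Illustrative: after a compare that leaves CF set, 'sbb %eax, %eax'
// computes eax - eax - CF, i.e. 0 when CF=0 and -1 (all ones) when CF=1,
// which is why this mask form is often more useful than setb's 0/1.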

/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue Z = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
                               DAG.getConstant(1, DL, Z.getValueType()));

  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), NewCmp);

  // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), NewCmp);
}
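
// Worked example for the transform above (illustrative): 'X + (Z != 0)'
// becomes (sbb X, -1, (cmp Z, 1)). The compare sets CF exactly when Z == 0,
// and X - (-1) - CF = X + 1 - CF, which yields X + 1 for Z != 0 and X for
// Z == 0, as required.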

static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue MulOp = N->getOperand(0);
  SDValue Phi = N->getOperand(1);

  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
  unsigned VectorSize = VT.getVectorNumElements() * 16;
  // If the vector size is less than 128, or greater than the supported
  // RegSize, do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                VT.getVectorNumElements() / 2);

  // Shrink the operands of the mul.
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

  // The madd vector size is half of the original vector size.
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
  // Fill the rest of the output with 0.
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
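
// Illustrative: for a v8i32 reduction whose multiply inputs are known to fit
// in 16 bits, the operands are truncated to v8i16 and one vpmaddwd produces
// the v4i32 pairwise products-and-sums; the upper half of the result vector
// is padded with zeroes before being added back into the reduction phi.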

static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we can only update
  // part of the elements in the reduction vector.
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Update part of the elements of the reduction vector. This is done by
    // first extracting a sub-vector from it, updating this sub-vector, and
    // inserting it back.
    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
                                 DAG.getIntPtrConstant(0, DL));
    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
                       DAG.getIntPtrConstant(0, DL));
  } else
    return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
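
// Illustrative: psadbw sums absolute byte differences within each 64-bit
// lane, so a zext-abs-diff reduction over v16i8 inputs collapses into one
// psadbw yielding v2i64 partial sums, which the code above folds back into
// the wider i32 reduction vector.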

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext (x)) -> (vzext x)
  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();

  RHS = DAG.getConstant(-1, DL, VT);
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}
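
// Rationale sketch for the canonicalization above: the stored value is
// identical (x - 1 == x + (-1)), so value-derived flags such as ZF match,
// and funneling both forms through LADD means a single set of patterns
// handles these locked read-modify-write idioms.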

// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}

static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  SDValue Idx = N->getOperand(2);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT OpVT = N->getSimpleValueType(0);
  MVT SubVecVT = SubVec.getSimpleValueType();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subvector operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
        }
      }
      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));
      }
    }
  }

  return SDValue();
}
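
// Illustrative: inserting the same 16-byte load into both halves of a
// 256-bit vector takes the broadcast form above and can lower to one
// vbroadcastf128/vbroadcasti128 instead of a load plus a vinsertf128.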

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
  case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD: return combineAdd(N, DAG, Subtarget);
  case ISD::SUB: return combineSub(N, DAG, Subtarget);
  case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
  case X86ISD::ADC: return combineADC(N, DAG, DCI);
  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE: return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT: return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA: return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG);
  case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM: return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for DAG combiner
/// to promote the specified node. If true, it should return the desired
/// promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
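
// Illustrative rationale for the promotion above: rewriting 'addw %bx, %ax'
// as 'addl %ebx, %eax' drops the 0x66 operand-size prefix, and the extra
// high bits are harmless when only the low 16 bits of the result are
// consumed (a sketch; the load-folding checks above gate the cases where
// promotion would hurt).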

//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return true;
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
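
// Usage sketch for ExpandInlineAsm above (illustrative): an inline-asm call
// such as
//   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
// is rewritten into a call to @llvm.bswap.i32, which the backend can then
// select and schedule like any other intrinsic.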

/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R': case 'q': case 'Q': case 'a': case 'b':
  case 'c': case 'd': case 'S': case 'D': case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f': case 't': case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
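
// Editorial example (illustrative): for a multi-alternative constraint such
// as "Ir" on an integer operand, this hook is queried once per alternative;
// a ConstantInt scores CW_Constant against 'I' only if it fits in 0..31, so
// an out-of-range value such as 40 falls back to the 'r' alternative.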
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
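
// Editorial example (illustrative): for asm("..." : "=X"(d)) where d is a
// double, this hook rewrites the fully-general "X" to "Y" when SSE2 is
// available (or "x" with only SSE1), so the operand lands in an XMM register
// instead of the x87 stack.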
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
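
// Editorial example (illustrative): for asm("shll %1, %0" : "+r"(x) : "I"(5)),
// the 'I' case above checks 5 <= 31 and emits it as a target immediate; an
// out-of-range count adds nothing to Ops, and the generic layer then reports
// the operand as invalid for the constraint.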
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64-bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
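
// Editorial example (illustrative): an explicit-register clause like {ax} on
// an i32 operand takes the isGRClass path above: getX86SubSuperRegisterOrZero
// widens AX to EAX for the 32-bit size and the class is corrected to GR32,
// rather than leaving a 16-bit register for a 32-bit value.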
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out-of-order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
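
// Editorial example (illustrative): an address of the form base + 4*index
// arrives with AM.Scale == 4, so a legal scaled mode is charged a cost of 1,
// a base-only mode (AM.Scale == 0) is free, and an unsupported mode returns
// a negative value; LoopStrengthReduce uses this to prefer unscaled forms.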
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
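
// Editorial example (illustrative): in a function marked minsize, a scalar
// "x / 7" is kept as a single div instruction, while without MinSize this
// hook returns false and the divide is expanded into the usual
// multiply-by-magic-constant sequence.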
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
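
// Editorial note (illustrative): the split-CSR mechanism above backs calling
// conventions like CXX_FAST_TLS, copying callee-saved GR64 registers into
// virtual registers at entry and back at each exit so the save/restore code
// can stay out of the hot path.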
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
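
// Editorial note (illustrative; the exact register is an assumption): the
// swifterror value is modeled as a register operand (R12 on x86-64), which
// this target only supports in 64-bit mode.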