//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/WinEHFuncInfo.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/GlobalAlias.h"
45 #include "llvm/IR/GlobalVariable.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/Intrinsics.h"
48 #include "llvm/MC/MCAsmInfo.h"
49 #include "llvm/MC/MCContext.h"
50 #include "llvm/MC/MCExpr.h"
51 #include "llvm/MC/MCSymbol.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/MathExtras.h"
56 #include "llvm/Target/TargetLowering.h"
57 #include "llvm/Target/TargetOptions.h"
64 #define DEBUG_TYPE "x86-isel"
66 STATISTIC(NumTailCalls, "Number of tail calls");
68 static cl::opt<bool> ExperimentalVectorWideningLegalization(
69 "x86-experimental-vector-widening-legalization", cl::init(false),
70 cl::desc("Enable an experimental vector type legalization through widening "
71 "rather than promotion."),
74 static cl::opt<int> ExperimentalPrefLoopAlignment(
75 "x86-experimental-pref-loop-alignment", cl::init(4),
76 cl::desc("Sets the preferable loop alignment for experiments "
77 "(the last x86-experimental-pref-loop-alignment bits"
78 " of the loop header PC will be 0)."),
81 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
82 const X86Subtarget &STI)
83 : TargetLowering(TM), Subtarget(STI) {
84 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
85 X86ScalarSSEf64 = Subtarget.hasSSE2();
86 X86ScalarSSEf32 = Subtarget.hasSSE1();
87 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
89 // Set up the TargetLowering object.
91 // X86 is weird. It always uses i8 for shift amounts and setcc results.
92 setBooleanContents(ZeroOrOneBooleanContent);
93 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
94 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
96 // For 64-bit, since we have so many registers, use the ILP scheduler.
97 // For 32-bit, use the register pressure specific scheduling.
98 // For Atom, always use ILP scheduling.
99 if (Subtarget.isAtom())
100 setSchedulingPreference(Sched::ILP);
101 else if (Subtarget.is64Bit())
102 setSchedulingPreference(Sched::ILP);
104 setSchedulingPreference(Sched::RegPressure);
105 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
106 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
108 // Bypass expensive divides and use cheaper ones.
109 if (TM.getOptLevel() >= CodeGenOpt::Default) {
110 if (Subtarget.hasSlowDivide32())
111 addBypassSlowDiv(32, 8);
112 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
113 addBypassSlowDiv(64, 32);
116 if (Subtarget.isTargetKnownWindowsMSVC() ||
117 Subtarget.isTargetWindowsItanium()) {
118 // Setup Windows compiler runtime calls.
119 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
120 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
121 setLibcallName(RTLIB::SREM_I64, "_allrem");
122 setLibcallName(RTLIB::UREM_I64, "_aullrem");
123 setLibcallName(RTLIB::MUL_I64, "_allmul");
124 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
125 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
126 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
127 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
128 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
131 if (Subtarget.isTargetDarwin()) {
132 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
133 setUseUnderscoreSetJmp(false);
134 setUseUnderscoreLongJmp(false);
135 } else if (Subtarget.isTargetWindowsGNU()) {
136 // MS runtime is weird: it exports _setjmp, but longjmp!
137 setUseUnderscoreSetJmp(true);
138 setUseUnderscoreLongJmp(false);
140 setUseUnderscoreSetJmp(true);
141 setUseUnderscoreLongJmp(true);
144 // Set up the register classes.
145 addRegisterClass(MVT::i8, &X86::GR8RegClass);
146 addRegisterClass(MVT::i16, &X86::GR16RegClass);
147 addRegisterClass(MVT::i32, &X86::GR32RegClass);
148 if (Subtarget.is64Bit())
149 addRegisterClass(MVT::i64, &X86::GR64RegClass);
151 for (MVT VT : MVT::integer_valuetypes())
152 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
154 // We don't accept any truncstore of integer registers.
155 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
156 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
157 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
158 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
159 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
160 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
162 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
164 // SETOEQ and SETUNE require checking two conditions.
165 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
166 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
167 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
168 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
169 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
170 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
172 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
174 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
175 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
176 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
178 if (Subtarget.is64Bit()) {
179 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
180 // f32/f64 are legal, f80 is custom.
181 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
183 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
184 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
185 } else if (!Subtarget.useSoftFloat()) {
186 // We have an algorithm for SSE2->double, and we turn this into a
187 // 64-bit FILD followed by conditional FADD for other targets.
188 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
189 // We have an algorithm for SSE2, and we turn this into a 64-bit
190 // FILD or VCVTUSI2SS/SD for other targets.
191 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
194 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
196 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
197 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
199 if (!Subtarget.useSoftFloat()) {
200 // SSE has no i16 to fp conversion, only i32.
201 if (X86ScalarSSEf32) {
202 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
203 // f32 and f64 cases are Legal, f80 case is not
204 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
206 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
207 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
210 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
211 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
214 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
216 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
217 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
219 if (!Subtarget.useSoftFloat()) {
220 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
221 // are Legal, f80 is custom lowered.
222 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
223 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
225 if (X86ScalarSSEf32) {
226 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
227 // f32 and f64 cases are Legal, f80 case is not
228 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
230 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
231 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
234 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
235 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
236 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
239 // Handle FP_TO_UINT by promoting the destination to a larger signed
241 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
242 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
243 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
245 if (Subtarget.is64Bit()) {
246 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
247 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
248 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
249 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
251 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
252 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
254 } else if (!Subtarget.useSoftFloat()) {
255 // Since AVX is a superset of SSE3, only check for SSE here.
256 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
257 // Expand FP_TO_UINT into a select.
258 // FIXME: We would like to use a Custom expander here eventually to do
259 // the optimal thing for SSE vs. the default expansion in the legalizer.
260 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
262 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
263 // With SSE3 we can use fisttpll to convert to a signed i64; without
264 // SSE, we're stuck with a fistpll.
265 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
267 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
270 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
271 if (!X86ScalarSSEf64) {
272 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
273 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
274 if (Subtarget.is64Bit()) {
275 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
276 // Without SSE, i64->f64 goes through memory.
277 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
279 } else if (!Subtarget.is64Bit())
280 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
282 // Scalar integer divide and remainder are lowered to use operations that
283 // produce two results, to match the available instructions. This exposes
284 // the two-result form to trivial CSE, which is able to combine x/y and x%y
285 // into a single instruction.
287 // Scalar integer multiply-high is also lowered to use two-result
288 // operations, to match the available instructions. However, plain multiply
289 // (low) operations are left as Legal, as there are single-result
290 // instructions for this in x86. Using the two-result multiply instructions
291 // when both high and low results are needed must be arranged by dagcombine.
292 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
293 setOperationAction(ISD::MULHS, VT, Expand);
294 setOperationAction(ISD::MULHU, VT, Expand);
295 setOperationAction(ISD::SDIV, VT, Expand);
296 setOperationAction(ISD::UDIV, VT, Expand);
297 setOperationAction(ISD::SREM, VT, Expand);
298 setOperationAction(ISD::UREM, VT, Expand);
301 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
302 if (VT == MVT::i64 && !Subtarget.is64Bit())
304 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
305 setOperationAction(ISD::ADDC, VT, Custom);
306 setOperationAction(ISD::ADDE, VT, Custom);
307 setOperationAction(ISD::SUBC, VT, Custom);
308 setOperationAction(ISD::SUBE, VT, Custom);
311 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
312 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
313 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
314 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
315 setOperationAction(ISD::BR_CC, VT, Expand);
316 setOperationAction(ISD::SELECT_CC, VT, Expand);
318 if (Subtarget.is64Bit())
319 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
320 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
321 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
322 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
323 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
325 setOperationAction(ISD::FREM , MVT::f32 , Expand);
326 setOperationAction(ISD::FREM , MVT::f64 , Expand);
327 setOperationAction(ISD::FREM , MVT::f80 , Expand);
328 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
330 // Promote the i8 variants and force them on up to i32 which has a shorter
332 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
333 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
334 if (!Subtarget.hasBMI()) {
335 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
336 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
337 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
338 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
339 if (Subtarget.is64Bit()) {
340 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
341 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
345 if (Subtarget.hasLZCNT()) {
346 // When promoting the i8 variants, force them to i32 for a shorter
348 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
349 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
351 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
352 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
353 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
354 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
355 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
356 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
357 if (Subtarget.is64Bit()) {
358 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
359 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
363 // Special handling for half-precision floating point conversions.
364 // If we don't have F16C support, then lower half float conversions
365 // into library calls.
366 if (Subtarget.useSoftFloat() ||
367 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
368 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
369 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
372 // There's never any support for operations beyond MVT::f32.
373 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
374 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
375 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
376 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
378 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
379 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
380 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
381 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
382 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
383 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
385 if (Subtarget.hasPOPCNT()) {
386 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
388 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
389 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
390 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
391 if (Subtarget.is64Bit())
392 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
395 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
397 if (!Subtarget.hasMOVBE())
398 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
400 // These should be promoted to a larger select which is supported.
401 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
402 // X86 wants to expand cmov itself.
403 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
404 setOperationAction(ISD::SELECT, VT, Custom);
405 setOperationAction(ISD::SETCC, VT, Custom);
407 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
408 if (VT == MVT::i64 && !Subtarget.is64Bit())
410 setOperationAction(ISD::SELECT, VT, Custom);
411 setOperationAction(ISD::SETCC, VT, Custom);
412 setOperationAction(ISD::SETCCE, VT, Custom);
414 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
415 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
416 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
417 // support continuation, user-level threading, and etc.. As a result, no
418 // other SjLj exception interfaces are implemented and please don't build
419 // your own exception handling based on them.
420 // LLVM/Clang supports zero-cost DWARF exception handling.
421 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
422 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
423 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
424 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
425 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
428 for (auto VT : { MVT::i32, MVT::i64 }) {
429 if (VT == MVT::i64 && !Subtarget.is64Bit())
431 setOperationAction(ISD::ConstantPool , VT, Custom);
432 setOperationAction(ISD::JumpTable , VT, Custom);
433 setOperationAction(ISD::GlobalAddress , VT, Custom);
434 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
435 setOperationAction(ISD::ExternalSymbol , VT, Custom);
436 setOperationAction(ISD::BlockAddress , VT, Custom);
439 // 64-bit shl, sra, srl (iff 32-bit x86)
440 for (auto VT : { MVT::i32, MVT::i64 }) {
441 if (VT == MVT::i64 && !Subtarget.is64Bit())
443 setOperationAction(ISD::SHL_PARTS, VT, Custom);
444 setOperationAction(ISD::SRA_PARTS, VT, Custom);
445 setOperationAction(ISD::SRL_PARTS, VT, Custom);
448 if (Subtarget.hasSSE1())
449 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
451 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
453 // Expand certain atomics
454 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
455 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
456 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
457 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
458 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
459 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
460 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
461 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
464 if (Subtarget.hasCmpxchg16b()) {
465 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
468 // FIXME - use subtarget debug flags
469 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
470 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
471 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
472 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
475 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
476 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
478 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
479 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
481 setOperationAction(ISD::TRAP, MVT::Other, Legal);
482 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
484 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
485 setOperationAction(ISD::VASTART , MVT::Other, Custom);
486 setOperationAction(ISD::VAEND , MVT::Other, Expand);
487 bool Is64Bit = Subtarget.is64Bit();
488 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
489 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
491 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
492 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
494 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
496 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
497 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
498 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
500 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
501 // f32 and f64 use SSE.
502 // Set up the FP register classes.
503 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
504 : &X86::FR32RegClass);
505 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
506 : &X86::FR64RegClass);
508 for (auto VT : { MVT::f32, MVT::f64 }) {
509 // Use ANDPD to simulate FABS.
510 setOperationAction(ISD::FABS, VT, Custom);
512 // Use XORP to simulate FNEG.
513 setOperationAction(ISD::FNEG, VT, Custom);
515 // Use ANDPD and ORPD to simulate FCOPYSIGN.
516 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
518 // We don't support sin/cos/fmod
519 setOperationAction(ISD::FSIN , VT, Expand);
520 setOperationAction(ISD::FCOS , VT, Expand);
521 setOperationAction(ISD::FSINCOS, VT, Expand);
524 // Lower this to MOVMSK plus an AND.
525 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
526 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
528 // Expand FP immediates into loads from the stack, except for the special
530 addLegalFPImmediate(APFloat(+0.0)); // xorpd
531 addLegalFPImmediate(APFloat(+0.0f)); // xorps
532 } else if (UseX87 && X86ScalarSSEf32) {
533 // Use SSE for f32, x87 for f64.
534 // Set up the FP register classes.
535 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
536 : &X86::FR32RegClass);
537 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
539 // Use ANDPS to simulate FABS.
540 setOperationAction(ISD::FABS , MVT::f32, Custom);
542 // Use XORP to simulate FNEG.
543 setOperationAction(ISD::FNEG , MVT::f32, Custom);
545 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
547 // Use ANDPS and ORPS to simulate FCOPYSIGN.
548 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
549 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
551 // We don't support sin/cos/fmod
552 setOperationAction(ISD::FSIN , MVT::f32, Expand);
553 setOperationAction(ISD::FCOS , MVT::f32, Expand);
554 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
556 // Special cases we handle for FP constants.
557 addLegalFPImmediate(APFloat(+0.0f)); // xorps
558 addLegalFPImmediate(APFloat(+0.0)); // FLD0
559 addLegalFPImmediate(APFloat(+1.0)); // FLD1
560 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
561 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
563 if (!TM.Options.UnsafeFPMath) {
564 setOperationAction(ISD::FSIN , MVT::f64, Expand);
565 setOperationAction(ISD::FCOS , MVT::f64, Expand);
566 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
569 // f32 and f64 in x87.
570 // Set up the FP register classes.
571 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
572 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
574 for (auto VT : { MVT::f32, MVT::f64 }) {
575 setOperationAction(ISD::UNDEF, VT, Expand);
576 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
578 if (!TM.Options.UnsafeFPMath) {
579 setOperationAction(ISD::FSIN , VT, Expand);
580 setOperationAction(ISD::FCOS , VT, Expand);
581 setOperationAction(ISD::FSINCOS, VT, Expand);
584 addLegalFPImmediate(APFloat(+0.0)); // FLD0
585 addLegalFPImmediate(APFloat(+1.0)); // FLD1
586 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
587 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
588 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
589 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
590 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
591 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
594 // We don't support FMA.
595 setOperationAction(ISD::FMA, MVT::f64, Expand);
596 setOperationAction(ISD::FMA, MVT::f32, Expand);
598 // Long double always uses X87, except f128 in MMX.
600 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
601 addRegisterClass(MVT::f128, &X86::FR128RegClass);
602 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
603 setOperationAction(ISD::FABS , MVT::f128, Custom);
604 setOperationAction(ISD::FNEG , MVT::f128, Custom);
605 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
608 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
609 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
610 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
612 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
613 addLegalFPImmediate(TmpFlt); // FLD0
615 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
618 APFloat TmpFlt2(+1.0);
619 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
621 addLegalFPImmediate(TmpFlt2); // FLD1
622 TmpFlt2.changeSign();
623 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
626 if (!TM.Options.UnsafeFPMath) {
627 setOperationAction(ISD::FSIN , MVT::f80, Expand);
628 setOperationAction(ISD::FCOS , MVT::f80, Expand);
629 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
632 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
633 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
634 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
635 setOperationAction(ISD::FRINT, MVT::f80, Expand);
636 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
637 setOperationAction(ISD::FMA, MVT::f80, Expand);
640 // Always use a library call for pow.
641 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
642 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
643 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
645 setOperationAction(ISD::FLOG, MVT::f80, Expand);
646 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
647 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
648 setOperationAction(ISD::FEXP, MVT::f80, Expand);
649 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
650 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
651 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
653 // Some FP actions are always expanded for vector types.
654 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
655 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
656 setOperationAction(ISD::FSIN, VT, Expand);
657 setOperationAction(ISD::FSINCOS, VT, Expand);
658 setOperationAction(ISD::FCOS, VT, Expand);
659 setOperationAction(ISD::FREM, VT, Expand);
660 setOperationAction(ISD::FPOWI, VT, Expand);
661 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
662 setOperationAction(ISD::FPOW, VT, Expand);
663 setOperationAction(ISD::FLOG, VT, Expand);
664 setOperationAction(ISD::FLOG2, VT, Expand);
665 setOperationAction(ISD::FLOG10, VT, Expand);
666 setOperationAction(ISD::FEXP, VT, Expand);
667 setOperationAction(ISD::FEXP2, VT, Expand);
670 // First set operation action for all vector types to either promote
671 // (for widening) or expand (for scalarization). Then we will selectively
672 // turn on ones that can be effectively codegen'd.
673 for (MVT VT : MVT::vector_valuetypes()) {
674 setOperationAction(ISD::SDIV, VT, Expand);
675 setOperationAction(ISD::UDIV, VT, Expand);
676 setOperationAction(ISD::SREM, VT, Expand);
677 setOperationAction(ISD::UREM, VT, Expand);
678 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
679 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
680 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
681 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
682 setOperationAction(ISD::FMA, VT, Expand);
683 setOperationAction(ISD::FFLOOR, VT, Expand);
684 setOperationAction(ISD::FCEIL, VT, Expand);
685 setOperationAction(ISD::FTRUNC, VT, Expand);
686 setOperationAction(ISD::FRINT, VT, Expand);
687 setOperationAction(ISD::FNEARBYINT, VT, Expand);
688 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
689 setOperationAction(ISD::MULHS, VT, Expand);
690 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
691 setOperationAction(ISD::MULHU, VT, Expand);
692 setOperationAction(ISD::SDIVREM, VT, Expand);
693 setOperationAction(ISD::UDIVREM, VT, Expand);
694 setOperationAction(ISD::CTPOP, VT, Expand);
695 setOperationAction(ISD::CTTZ, VT, Expand);
696 setOperationAction(ISD::CTLZ, VT, Expand);
697 setOperationAction(ISD::ROTL, VT, Expand);
698 setOperationAction(ISD::ROTR, VT, Expand);
699 setOperationAction(ISD::BSWAP, VT, Expand);
700 setOperationAction(ISD::SETCC, VT, Expand);
701 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
702 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
703 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
704 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
705 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
706 setOperationAction(ISD::TRUNCATE, VT, Expand);
707 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
708 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
709 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
710 setOperationAction(ISD::SELECT_CC, VT, Expand);
711 for (MVT InnerVT : MVT::vector_valuetypes()) {
712 setTruncStoreAction(InnerVT, VT, Expand);
714 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
715 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
717 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
718 // types, we have to deal with them whether we ask for Expansion or not.
719 // Setting Expand causes its own optimisation problems though, so leave
721 if (VT.getVectorElementType() == MVT::i1)
722 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
724 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
725 // split/scalarized right now.
726 if (VT.getVectorElementType() == MVT::f16)
727 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
731 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
732 // with -msoft-float, disable use of MMX as well.
733 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
734 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
735 // No operations on x86mmx supported, everything uses intrinsics.
738 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
739 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
740 : &X86::VR128RegClass);
742 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
743 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
744 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
745 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
746 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
747 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
748 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
749 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
750 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
753 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
754 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
755 : &X86::VR128RegClass);
757 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
758 // registers cannot be used even for integer operations.
759 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
760 : &X86::VR128RegClass);
761 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
762 : &X86::VR128RegClass);
763 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
764 : &X86::VR128RegClass);
765 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
766 : &X86::VR128RegClass);
768 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
769 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
770 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
771 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
772 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
773 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
774 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
775 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
776 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
777 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
778 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
779 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
780 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
782 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
783 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
784 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
785 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
787 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
788 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
789 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
790 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
792 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
793 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
794 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
795 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
796 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
797 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
799 setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
800 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
801 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
802 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
804 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
805 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
806 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
807 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
809 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
810 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
811 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
812 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
813 setOperationAction(ISD::VSELECT, VT, Custom);
814 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
817 // We support custom legalizing of sext and anyext loads for specific
818 // memory vector types which we can load as a scalar (or sequence of
819 // scalars) and extend in-register to a legal 128-bit vector type. For sext
820 // loads these must work with a single scalar load.
821 for (MVT VT : MVT::integer_vector_valuetypes()) {
822 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
823 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
824 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
825 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
826 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
827 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
828 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
829 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
830 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
833 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
834 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
835 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
836 setOperationAction(ISD::VSELECT, VT, Custom);
838 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
841 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
842 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
845 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
846 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
847 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
848 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
849 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
850 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
851 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
854 // Custom lower v2i64 and v2f64 selects.
855 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
856 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
858 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
859 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
861 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
862 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
864 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
865 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
866 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
868 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
869 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
871 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
872 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
874 for (MVT VT : MVT::fp_vector_valuetypes())
875 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
877 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
878 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
879 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
881 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
882 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
883 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
885 for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
886 setOperationAction(ISD::SRL, VT, Custom);
887 setOperationAction(ISD::SHL, VT, Custom);
888 setOperationAction(ISD::SRA, VT, Custom);
891 // In the customized shift lowering, the legal cases in AVX2 will be
893 for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
894 setOperationAction(ISD::SRL, VT, Custom);
895 setOperationAction(ISD::SHL, VT, Custom);
896 setOperationAction(ISD::SRA, VT, Custom);
900 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
901 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
902 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
903 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
904 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
905 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
906 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
907 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
908 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
911 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
912 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
913 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
914 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
915 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
916 setOperationAction(ISD::FRINT, RoundedTy, Legal);
917 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
920 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
921 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
922 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
923 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
924 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
925 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
926 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
927 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
929 // FIXME: Do we need to handle scalar-to-vector here?
930 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
932 // We directly match byte blends in the backend as they match the VSELECT
934 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
936 // SSE41 brings specific instructions for doing vector sign extend even in
937 // cases where we don't have SRA.
938 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
939 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
940 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
942 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
943 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
944 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
946 for (MVT VT : MVT::integer_vector_valuetypes()) {
947 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
948 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
949 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
952 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
953 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
954 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
955 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
956 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
957 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
958 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
960 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
961 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
962 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
963 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
964 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
965 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
967 // i8 vectors are custom because the source register and source
968 // memory operand types are not the same width.
969 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
972 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
973 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
974 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
975 setOperationAction(ISD::ROTL, VT, Custom);
977 // XOP can efficiently perform BITREVERSE with VPPERM.
978 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
979 setOperationAction(ISD::BITREVERSE, VT, Custom);
981 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
982 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
983 setOperationAction(ISD::BITREVERSE, VT, Custom);
986 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
987 bool HasInt256 = Subtarget.hasInt256();
989 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
990 : &X86::VR256RegClass);
991 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
992 : &X86::VR256RegClass);
993 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
994 : &X86::VR256RegClass);
995 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
996 : &X86::VR256RegClass);
997 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
998 : &X86::VR256RegClass);
999 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1000 : &X86::VR256RegClass);
1002 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1003 setOperationAction(ISD::FFLOOR, VT, Legal);
1004 setOperationAction(ISD::FCEIL, VT, Legal);
1005 setOperationAction(ISD::FTRUNC, VT, Legal);
1006 setOperationAction(ISD::FRINT, VT, Legal);
1007 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1008 setOperationAction(ISD::FNEG, VT, Custom);
1009 setOperationAction(ISD::FABS, VT, Custom);
1010 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1013 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1014 // even though v8i16 is a legal type.
1015 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
1016 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
1017 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1019 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
1020 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1021 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1023 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1024 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1026 for (MVT VT : MVT::fp_vector_valuetypes())
1027 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1029 for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
1030 setOperationAction(ISD::SRL, VT, Custom);
1031 setOperationAction(ISD::SHL, VT, Custom);
1032 setOperationAction(ISD::SRA, VT, Custom);
1035 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
1036 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
1037 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
1038 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
1040 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1041 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1042 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1044 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1045 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
1046 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1047 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1048 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1049 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1050 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1051 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1052 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1053 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1054 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1055 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1056 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1058 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1059 setOperationAction(ISD::CTPOP, VT, Custom);
1060 setOperationAction(ISD::CTTZ, VT, Custom);
1061 setOperationAction(ISD::CTLZ, VT, Custom);
1064 if (Subtarget.hasAnyFMA()) {
1065 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1066 MVT::v2f64, MVT::v4f64 })
1067 setOperationAction(ISD::FMA, VT, Legal);
1070 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1071 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1072 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1075 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1076 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1077 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1078 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1080 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1081 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1083 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1084 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1085 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1086 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1088 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1089 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1090 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1091 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1092 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1093 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1097 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1098 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1099 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1101 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1102 // when we have a 256bit-wide blend with immediate.
1103 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1105 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1106 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1107 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1108 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1109 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1110 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1111 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1113 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1114 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1115 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1116 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1117 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1118 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1121 // In the customized shift lowering, the legal cases in AVX2 will be
1123 for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1124 setOperationAction(ISD::SRL, VT, Custom);
1125 setOperationAction(ISD::SHL, VT, Custom);
1126 setOperationAction(ISD::SRA, VT, Custom);
1129 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1130 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1131 setOperationAction(ISD::MLOAD, VT, Legal);
1132 setOperationAction(ISD::MSTORE, VT, Legal);
1135 // Extract subvector is special because the value type
1136 // (result) is 128-bit but the source is 256-bit wide.
1137 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1138 MVT::v4f32, MVT::v2f64 }) {
1139 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1142 // Custom lower several nodes for 256-bit types.
1143 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1144 MVT::v8f32, MVT::v4f64 }) {
1145 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1146 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1147 setOperationAction(ISD::VSELECT, VT, Custom);
1148 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1149 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1150 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1151 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1152 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1156 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1158 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1159 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1160 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1161 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1162 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1163 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1164 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1168 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1169 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1170 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1171 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1172 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1174 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1175 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1176 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1178 for (MVT VT : MVT::fp_vector_valuetypes())
1179 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1181 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1182 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1183 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1184 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1185 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1186 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1187 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1189 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1190 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1191 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1192 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1193 setOperationAction(ISD::XOR, MVT::i1, Legal);
1194 setOperationAction(ISD::OR, MVT::i1, Legal);
1195 setOperationAction(ISD::AND, MVT::i1, Legal);
1196 setOperationAction(ISD::SUB, MVT::i1, Custom);
1197 setOperationAction(ISD::ADD, MVT::i1, Custom);
1198 setOperationAction(ISD::MUL, MVT::i1, Custom);
1200 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1201 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1202 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1203 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1204 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1205 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1206 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1207 setTruncStoreAction(VT, MaskVT, Custom);
1210 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1211 setOperationAction(ISD::FNEG, VT, Custom);
1212 setOperationAction(ISD::FABS, VT, Custom);
1213 setOperationAction(ISD::FMA, VT, Legal);
1214 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1217 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1218 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1219 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1220 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1221 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1222 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1223 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1224 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1225 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1226 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1227 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1228 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1229 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1230 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1231 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1232 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1233 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1234 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1235 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1236 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1237 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1238 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1239 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1240 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1241 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1243 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1244 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1245 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1246 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1247 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1248 if (Subtarget.hasVLX()){
1249 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1250 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1251 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1252 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1253 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1255 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1256 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1257 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1258 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1259 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1261 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1262 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1263 setOperationAction(ISD::MLOAD, VT, Custom);
1264 setOperationAction(ISD::MSTORE, VT, Custom);
1267 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1268 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1269 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1270 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
1271 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
1272 setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
1273 setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
1274 if (Subtarget.hasDQI()) {
1275 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1276 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
1277 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1278 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1279 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
1280 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1281 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1282 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
1283 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1284 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1285 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
1286 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
1288 if (Subtarget.hasVLX()) {
1289 // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1290 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1291 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1292 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1295 if (Subtarget.hasVLX()) {
1296 setOperationAction(ISD::ABS, MVT::v4i64, Legal);
1297 setOperationAction(ISD::ABS, MVT::v2i64, Legal);
1298 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1299 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1300 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1301 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1302 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1303 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1304 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1305 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1306 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1307 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1308 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1310 // FIXME: These instructions are available on SSE/AVX2, add relevant patterns.
1311 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1312 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1313 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1314 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1315 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1316 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1317 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1318 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1319 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1320 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1323 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1324 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1325 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1327 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1328 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1329 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1330 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1331 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1332 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1333 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1334 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1336 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1337 setOperationAction(ISD::FFLOOR, VT, Legal);
1338 setOperationAction(ISD::FCEIL, VT, Legal);
1339 setOperationAction(ISD::FTRUNC, VT, Legal);
1340 setOperationAction(ISD::FRINT, VT, Legal);
1341 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1344 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1345 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1347 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1348 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1349 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1351 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1352 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1357 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1358 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1360 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1362 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1363 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1364 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1365 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1366 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1367 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1368 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1369 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1370 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1371 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1372 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1373 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1375 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1376 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1377 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1378 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1379 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1380 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1381 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1382 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1384 setOperationAction(ISD::ADD, MVT::v8i1, Custom);
1385 setOperationAction(ISD::ADD, MVT::v16i1, Custom);
1386 setOperationAction(ISD::SUB, MVT::v8i1, Custom);
1387 setOperationAction(ISD::SUB, MVT::v16i1, Custom);
1388 setOperationAction(ISD::MUL, MVT::v8i1, Custom);
1389 setOperationAction(ISD::MUL, MVT::v16i1, Custom);
1391 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1393 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1394 setOperationAction(ISD::ABS, VT, Legal);
1395 setOperationAction(ISD::SRL, VT, Custom);
1396 setOperationAction(ISD::SHL, VT, Custom);
1397 setOperationAction(ISD::SRA, VT, Custom);
1398 setOperationAction(ISD::CTPOP, VT, Custom);
1399 setOperationAction(ISD::CTTZ, VT, Custom);
1402 // Need to promote to 64-bit even though we have 32-bit masked instructions
1403 // because the IR optimizers rearrange bitcasts around logic ops leaving
1404 // too many variations to handle if we don't promote them.
1405 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1406 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1407 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1409 if (Subtarget.hasCDI()) {
1410 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1411 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1413 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1414 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1415 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1416 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1418 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1419 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1421 if (Subtarget.hasVLX()) {
1422 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1423 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1424 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1425 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1427 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1428 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1429 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1430 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1433 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1434 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1435 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1436 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1437 } // Subtarget.hasCDI()
1439 if (Subtarget.hasDQI()) {
1440 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1441 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1442 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1443 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1446 // Custom lower several nodes.
1447 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1448 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1449 setOperationAction(ISD::MGATHER, VT, Custom);
1450 setOperationAction(ISD::MSCATTER, VT, Custom);
1452 // Extract subvector is special because the value type
1453 // (result) is 256-bit but the source is 512-bit wide.
1454 // 128-bit was made Custom under AVX1.
1455 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1456 MVT::v8f32, MVT::v4f64 })
1457 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1458 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1459 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1460 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1462 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1463 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1464 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1465 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1466 setOperationAction(ISD::VSELECT, VT, Legal);
1467 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1468 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1469 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1470 setOperationAction(ISD::MLOAD, VT, Legal);
1471 setOperationAction(ISD::MSTORE, VT, Legal);
1472 setOperationAction(ISD::MGATHER, VT, Legal);
1473 setOperationAction(ISD::MSCATTER, VT, Custom);
1475 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1476 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1477 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1481 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1482 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1483 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1485 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1486 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1488 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1489 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1490 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1491 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1492 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1493 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1495 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1496 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1497 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1498 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1499 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1500 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1501 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1503 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1505 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1507 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1508 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1509 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1510 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1511 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1512 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1513 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1514 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1515 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1516 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1517 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1518 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1519 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1520 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1521 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1522 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1523 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1524 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1525 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1527 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1528 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1529 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1530 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1531 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1532 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1533 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1534 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1535 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1536 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1537 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1538 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1539 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1540 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1541 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1543 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1544 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1545 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1546 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1547 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1548 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1549 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1550 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1552 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1554 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1555 if (Subtarget.hasVLX()) {
1556 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1557 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1560 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1561 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1562 setOperationAction(ISD::MLOAD, VT, Action);
1563 setOperationAction(ISD::MSTORE, VT, Action);
1566 if (Subtarget.hasCDI()) {
1567 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1568 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1571 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1572 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1573 setOperationAction(ISD::VSELECT, VT, Legal);
1574 setOperationAction(ISD::ABS, VT, Legal);
1575 setOperationAction(ISD::SRL, VT, Custom);
1576 setOperationAction(ISD::SHL, VT, Custom);
1577 setOperationAction(ISD::SRA, VT, Custom);
1578 setOperationAction(ISD::MLOAD, VT, Legal);
1579 setOperationAction(ISD::MSTORE, VT, Legal);
1580 setOperationAction(ISD::CTPOP, VT, Custom);
1581 setOperationAction(ISD::CTTZ, VT, Custom);
1583 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1584 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1585 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1588 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1589 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1590 if (Subtarget.hasVLX()) {
1591 // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1592 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1593 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1598 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1599 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1600 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1602 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1603 setOperationAction(ISD::ADD, VT, Custom);
1604 setOperationAction(ISD::SUB, VT, Custom);
1605 setOperationAction(ISD::MUL, VT, Custom);
1606 setOperationAction(ISD::VSELECT, VT, Expand);
1608 setOperationAction(ISD::TRUNCATE, VT, Custom);
1609 setOperationAction(ISD::SETCC, VT, Custom);
1610 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1611 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1612 setOperationAction(ISD::SELECT, VT, Custom);
1613 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1614 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1617 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1618 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1619 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1620 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1622 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1623 setOperationAction(ISD::SMAX, VT, Legal);
1624 setOperationAction(ISD::UMAX, VT, Legal);
1625 setOperationAction(ISD::SMIN, VT, Legal);
1626 setOperationAction(ISD::UMIN, VT, Legal);
1630 // We want to custom lower some of our intrinsics.
1631 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1632 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1633 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1634 if (!Subtarget.is64Bit()) {
1635 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1636 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1639 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1640 // handle type legalization for these operations here.
1642 // FIXME: We really should do custom legalization for addition and
1643 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1644 // than generic legalization for 64-bit multiplication-with-overflow, though.
1645 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1646 if (VT == MVT::i64 && !Subtarget.is64Bit())
1648 // Add/Sub/Mul with overflow operations are custom lowered.
1649 setOperationAction(ISD::SADDO, VT, Custom);
1650 setOperationAction(ISD::UADDO, VT, Custom);
1651 setOperationAction(ISD::SSUBO, VT, Custom);
1652 setOperationAction(ISD::USUBO, VT, Custom);
1653 setOperationAction(ISD::SMULO, VT, Custom);
1654 setOperationAction(ISD::UMULO, VT, Custom);
1657 if (!Subtarget.is64Bit()) {
1658 // These libcalls are not available in 32-bit.
1659 setLibcallName(RTLIB::SHL_I128, nullptr);
1660 setLibcallName(RTLIB::SRL_I128, nullptr);
1661 setLibcallName(RTLIB::SRA_I128, nullptr);
1664 // Combine sin / cos into one node or libcall if possible.
1665 if (Subtarget.hasSinCos()) {
1666 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1667 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1668 if (Subtarget.isTargetDarwin()) {
1669 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1670 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1671 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1672 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1676 if (Subtarget.isTargetWin64()) {
1677 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1678 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1679 setOperationAction(ISD::SREM, MVT::i128, Custom);
1680 setOperationAction(ISD::UREM, MVT::i128, Custom);
1681 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1682 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1685 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1686 // is. We should promote the value to 64-bits to solve this.
1687 // This is what the CRT headers do - `fmodf` is an inline header
1688 // function casting to f64 and calling `fmod`.
1689 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1690 Subtarget.isTargetWindowsItanium()))
1691 for (ISD::NodeType Op :
1692 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1693 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1694 if (isOperationExpand(Op, MVT::f32))
1695 setOperationAction(Op, MVT::f32, Promote);
1697 // We have target-specific dag combine patterns for the following nodes:
1698 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1699 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1700 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1701 setTargetDAGCombine(ISD::BITCAST);
1702 setTargetDAGCombine(ISD::VSELECT);
1703 setTargetDAGCombine(ISD::SELECT);
1704 setTargetDAGCombine(ISD::SHL);
1705 setTargetDAGCombine(ISD::SRA);
1706 setTargetDAGCombine(ISD::SRL);
1707 setTargetDAGCombine(ISD::OR);
1708 setTargetDAGCombine(ISD::AND);
1709 setTargetDAGCombine(ISD::ADD);
1710 setTargetDAGCombine(ISD::FADD);
1711 setTargetDAGCombine(ISD::FSUB);
1712 setTargetDAGCombine(ISD::FNEG);
1713 setTargetDAGCombine(ISD::FMA);
1714 setTargetDAGCombine(ISD::FMINNUM);
1715 setTargetDAGCombine(ISD::FMAXNUM);
1716 setTargetDAGCombine(ISD::SUB);
1717 setTargetDAGCombine(ISD::LOAD);
1718 setTargetDAGCombine(ISD::MLOAD);
1719 setTargetDAGCombine(ISD::STORE);
1720 setTargetDAGCombine(ISD::MSTORE);
1721 setTargetDAGCombine(ISD::TRUNCATE);
1722 setTargetDAGCombine(ISD::ZERO_EXTEND);
1723 setTargetDAGCombine(ISD::ANY_EXTEND);
1724 setTargetDAGCombine(ISD::SIGN_EXTEND);
1725 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1726 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1727 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1728 setTargetDAGCombine(ISD::SINT_TO_FP);
1729 setTargetDAGCombine(ISD::UINT_TO_FP);
1730 setTargetDAGCombine(ISD::SETCC);
1731 setTargetDAGCombine(ISD::MUL);
1732 setTargetDAGCombine(ISD::XOR);
1733 setTargetDAGCombine(ISD::MSCATTER);
1734 setTargetDAGCombine(ISD::MGATHER);
1736 computeRegisterProperties(Subtarget.getRegisterInfo());
1738 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1739 MaxStoresPerMemsetOptSize = 8;
1740 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1741 MaxStoresPerMemcpyOptSize = 4;
1742 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1743 MaxStoresPerMemmoveOptSize = 4;
1744 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1745 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1747 // An out-of-order CPU can speculatively execute past a predictable branch,
1748 // but a conditional move could be stalled by an expensive earlier operation.
1749 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1750 EnableExtLdPromotion = true;
1751 setPrefFunctionAlignment(4); // 2^4 bytes.
1753 verifyIntrinsicTables();
1756 // This has so far only been implemented for 64-bit MachO.
// Use the LOAD_STACK_GUARD pseudo-node only on 64-bit Mach-O; every other
// target configuration uses the default stack-guard lowering.
1757 bool X86TargetLowering::useLoadStackGuardNode() const {
1758 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
// Under the experimental wide-vector legalization flag, prefer widening
// illegal vector types instead of splitting them — except single-element
// vectors and i1 mask vectors, which keep the default strategy.
1761 TargetLoweringBase::LegalizeTypeAction
1762 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1763 if (ExperimentalVectorWideningLegalization &&
1764 VT.getVectorNumElements() != 1 &&
1765 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1766 return TypeWidenVector;
// Otherwise defer to the target-independent default.
1768 return TargetLoweringBase::getPreferredVectorAction(VT);
// Returns the value type produced by a SETCC for the given compared type.
// With AVX-512, compares naturally produce k-register masks (i1 / v*i1);
// otherwise X86 produces i8 scalars or byte-per-element vectors.
1771 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1772 LLVMContext& Context,
// Scalar compare: i1 when AVX-512 mask registers exist, i8 otherwise.
1775 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1777 if (VT.isSimple()) {
1778 MVT VVT = VT.getSimpleVT();
1779 const unsigned NumElts = VVT.getVectorNumElements();
1780 MVT EltVT = VVT.getVectorElementType();
// 512-bit vectors compare into i1 masks when the element type is covered
// by the available AVX-512 feature set (32/64-bit elements need base
// AVX-512F only).
1781 if (VVT.is512BitVector()) {
1782 if (Subtarget.hasAVX512())
1783 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1784 EltVT == MVT::f32 || EltVT == MVT::f64)
1786 case 8: return MVT::v8i1;
1787 case 16: return MVT::v16i1;
// Byte/word elements additionally require AVX-512BW for v32i1/v64i1.
1789 if (Subtarget.hasBWI())
1790 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1792 case 32: return MVT::v32i1;
1793 case 64: return MVT::v64i1;
// Sub-512-bit vectors: with BWI+VLX a mask of matching element count can
// be used directly.
1797 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1798 return MVT::getVectorVT(MVT::i1, NumElts);
// For vectors that will be integer-promoted, base the decision on the
// element type they legalize to.
1800 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1801 EVT LegalVT = getTypeToTransformTo(Context, VT);
1802 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1805 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1807 case 2: return MVT::v2i1;
1808 case 4: return MVT::v4i1;
1809 case 8: return MVT::v8i1;
// Fallback: an integer vector with elements the width of the compared type.
1813 return VT.changeVectorElementTypeToInteger();
1816 /// Helper for getByValTypeAlignment to determine
1817 /// the desired ByVal argument alignment. Recursively raises \p MaxAlign to
1818 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
// 128-bit SSE vector types request 16-byte alignment.
1821 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1822 if (VTy->getBitWidth() == 128)
// Arrays inherit the alignment requirement of their element type.
1824 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1825 unsigned EltAlign = 0;
1826 getMaxByValAlign(ATy->getElementType(), EltAlign);
1827 if (EltAlign > MaxAlign)
1828 MaxAlign = EltAlign;
// Structs take the maximum over all members, recursing into nested types.
1829 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1830 for (auto *EltTy : STy->elements()) {
1831 unsigned EltAlign = 0;
1832 getMaxByValAlign(EltTy, EltAlign);
1833 if (EltAlign > MaxAlign)
1834 MaxAlign = EltAlign;
1841 /// Return the desired alignment for ByVal aggregate
1842 /// function arguments in the caller parameter area. For X86, aggregates
1843 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1844 /// are at 4-byte boundaries.
1845 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1846 const DataLayout &DL) const {
1847 if (Subtarget.is64Bit()) {
1848 // Max of 8 and alignment of type.
1849 unsigned TyAlign = DL.getABITypeAlignment(Ty);
// 32-bit: scan the aggregate for SSE vectors, which raise the alignment.
1856 if (Subtarget.hasSSE1())
1857 getMaxByValAlign(Ty, Align);
1861 /// Returns the target specific optimal type for load
1862 /// and store operations as a result of memset, memcpy, and memmove
1863 /// lowering. If DstAlign is zero that means the destination's
1864 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1865 /// means there isn't a need to check it against alignment requirement,
1866 /// probably because the source does not need to be loaded. If 'IsMemset' is
1867 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1868 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1869 /// source is constant so it does not need to be loaded.
1870 /// It returns EVT::Other if the type should be determined using generic
1871 /// target-independent logic.
1873 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1874 unsigned DstAlign, unsigned SrcAlign,
1875 bool IsMemset, bool ZeroMemset,
1877 MachineFunction &MF) const {
1878 const Function *F = MF.getFunction();
// Vector types may only be used when the function does not forbid
// implicit floating-point/vector usage.
1879 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
// Wide (16-byte+) ops are worthwhile only if unaligned 16-byte accesses
// are fast, or both endpoints are known 16-byte aligned.
1881 (!Subtarget.isUnalignedMem16Slow() ||
1882 ((DstAlign == 0 || DstAlign >= 16) &&
1883 (SrcAlign == 0 || SrcAlign >= 16)))) {
1884 // FIXME: Check if unaligned 32-byte accesses are slow.
1885 if (Size >= 32 && Subtarget.hasAVX()) {
1886 // Although this isn't a well-supported type for AVX1, we'll let
1887 // legalization and shuffle lowering produce the optimal codegen. If we
1888 // choose an optimal type with a vector element larger than a byte,
1889 // getMemsetStores() may create an intermediate splat (using an integer
1890 // multiply) before we splat as a vector.
1893 if (Subtarget.hasSSE2())
1895 // TODO: Can SSE1 handle a byte vector?
1896 if (Subtarget.hasSSE1())
1898 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1899 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1900 // Do not use f64 to lower memcpy if source is string constant. It's
1901 // better to use i32 to avoid the loads.
1902 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1903 // The gymnastics of splatting a byte value into an XMM register and then
1904 // only using 8-byte stores (because this is a CPU with slow unaligned
1905 // 16-byte accesses) makes that a loser.
1909 // This is a compromise. If we reach here, unaligned accesses may be slow on
1910 // this target. However, creating smaller, aligned accesses could be even
1911 // slower and would certainly be a lot more code.
1912 if (Subtarget.is64Bit() && Size >= 8)
// Scalar FP types are safe for use in memory ops only when this target
// lowers them in SSE registers (X86ScalarSSEf32/X86ScalarSSEf64);
// presumably x87-based lowering is avoided here — confirm against callers.
1917 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1919 return X86ScalarSSEf32;
1920 else if (VT == MVT::f64)
1921 return X86ScalarSSEf64;
// Reports, via *Fast, whether a misaligned access of the given type is
// fast on this subtarget; the speed verdict depends only on access width.
1926 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1931 switch (VT.getSizeInBits()) {
1933 // 8-byte and under are always assumed to be fast.
// 128-bit accesses: fast unless the subtarget flags unaligned 16-byte ops
// as slow; 256-bit likewise with the 32-byte flag.
1937 *Fast = !Subtarget.isUnalignedMem16Slow();
1940 *Fast = !Subtarget.isUnalignedMem32Slow();
1942 // TODO: What about AVX-512 (512-bit) accesses?
1945 // Misaligned accesses of any size are always allowed.
1949 /// Return the entry encoding for a jump table in the
1950 /// current function. The returned value is a member of the
1951 /// MachineJumpTableInfo::JTEntryKind enum.
1952 unsigned X86TargetLowering::getJumpTableEncoding() const {
1953 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// reference, which requires the custom (EK_Custom32) entry kind handled by
// LowerCustomJumpTableEntry below.
1955 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1956 return MachineJumpTableInfo::EK_Custom32;
1958 // Otherwise, use the normal jump table encoding heuristics.
1959 return TargetLowering::getJumpTableEncoding();
// Soft-float is purely a subtarget feature decision; delegate to it.
1962 bool X86TargetLowering::useSoftFloat() const {
1963 return Subtarget.useSoftFloat();
// Marks the leading integer/pointer arguments of an X86-32 C or stdcall
// libcall as inreg, honoring the module's register-parameter count
// (regparm-style passing). No-op on 64-bit and other conventions.
1966 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1967 ArgListTy &Args) const {
1969 // Only relabel X86-32 for C / Stdcall CCs.
1970 if (Subtarget.is64Bit())
1972 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
// Budget of integer registers available for parameters, taken from the
// module-level register-parameter setting.
1974 unsigned ParamRegs = 0;
1975 if (auto *M = MF->getFunction()->getParent())
1976 ParamRegs = M->getNumberRegisterParameters();
1978 // Mark the first N int arguments as having reg
1979 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1980 Type *T = Args[Idx].Ty;
// Only integer/pointer args of at most 8 bytes qualify; anything wider
// than 4 bytes consumes two registers from the budget.
1981 if (T->isPointerTy() || T->isIntegerTy())
1982 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1983 unsigned numRegs = 1;
1984 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1986 if (ParamRegs < numRegs)
1988 ParamRegs -= numRegs;
1989 Args[Idx].IsInReg = true;
// Emits the custom (EK_Custom32) jump-table entry selected by
// getJumpTableEncoding: a @GOTOFF reference to the block's symbol.
// Only valid in GOT PIC mode, as the assert enforces.
1995 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1996 const MachineBasicBlock *MBB,
1997 unsigned uid,MCContext &Ctx) const{
1998 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1999 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2001 return MCSymbolRefExpr::create(MBB->getSymbol(),
2002 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2005 /// Returns relocation base for the given PIC jumptable.
2006 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2007 SelectionDAG &DAG) const {
// On 32-bit targets the entries are @GOTOFF-relative, so the base is the
// global base register pseudo-node.
2008 if (!Subtarget.is64Bit())
2009 // This doesn't have SDLoc associated with it, but is not really the
2010 // same as a Register.
2011 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2012 getPointerTy(DAG.getDataLayout()));
2016 /// This returns the relocation base for the given PIC jumptable,
2017 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2018 const MCExpr *X86TargetLowering::
2019 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2020 MCContext &Ctx) const {
2021 // X86-64 uses RIP relative addressing based on the jump table label.
2022 if (Subtarget.isPICStyleRIPRel())
2023 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2025 // Otherwise, the reference is relative to the PIC base.
2026 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
// Chooses a representative register class (and relative copy cost) for the
// given value type; used by register-pressure heuristics.
2029 std::pair<const TargetRegisterClass *, uint8_t>
2030 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2032 const TargetRegisterClass *RRC = nullptr;
2034 switch (VT.SimpleTy) {
2036 return TargetLowering::findRepresentativeClass(TRI, VT);
// Scalar integers map to the widest available GPR class.
2037 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2038 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
// MMX values live in the 64-bit MMX vector register class.
2041 RRC = &X86::VR64RegClass;
// All scalar-FP and SSE/AVX/AVX-512 vector types share the XMM-based
// representative class (VR128X covers xmm0-xmm31).
2043 case MVT::f32: case MVT::f64:
2044 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2045 case MVT::v4f32: case MVT::v2f64:
2046 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2047 case MVT::v8f32: case MVT::v4f64:
2048 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2049 case MVT::v16f32: case MVT::v8f64:
2050 RRC = &X86::VR128XRegClass;
2053 return std::make_pair(RRC, Cost);
// Address space for segment-relative accesses (stack guard / safe stack):
// on 64-bit, GS (256) under the kernel code model, FS (257) otherwise.
2056 unsigned X86TargetLowering::getAddressSpace() const {
2057 if (Subtarget.is64Bit())
2058 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
// True when the target's C library reserves a fixed TLS slot for the stack
// guard: glibc, Fuchsia, and Android (bionic) at API level 17 or newer.
2062 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2063 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2064 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
// Builds a constant i8* pointing at the given byte offset within the given
// address space — used to form %fs/%gs segment-relative addresses.
2067 static Constant* SegmentOffset(IRBuilder<> &IRB,
2068 unsigned Offset, unsigned AddressSpace) {
2069 return ConstantExpr::getIntToPtr(
2070 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2071 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
// Returns the IR-level address of the stack guard value, preferring the
// libc-reserved TLS slot where one exists (see hasStackGuardSlotTLS).
2074 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2075 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2076 // tcbhead_t; use it instead of the usual global variable (see
2077 // sysdeps/{i386,x86_64}/nptl/tls.h)
2078 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2079 if (Subtarget.isTargetFuchsia()) {
2080 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2081 return SegmentOffset(IRB, 0x10, getAddressSpace());
2083 // %fs:0x28, unless we're using a Kernel code model, in which case
2084 // it's %gs:0x28. gs:0x14 on i386.
2085 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2086 return SegmentOffset(IRB, Offset, getAddressSpace());
// No TLS slot: fall back to the generic global-variable stack guard.
2090 return TargetLowering::getIRStackGuard(IRB);
// Declares the module-level symbols needed by stack-smashing protection.
// MSVC targets get the CRT cookie global and checker; TLS-slot targets
// need no declarations; everything else uses the generic path.
2093 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2094 // MSVC CRT provides functionalities for stack protection.
2095 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2096 // MSVC CRT has a global variable holding security cookie.
2097 M.getOrInsertGlobal("__security_cookie",
2098 Type::getInt8PtrTy(M.getContext()));
2100 // MSVC CRT has a function to validate security cookie.
2101 auto *SecurityCheckCookie = cast<Function>(
2102 M.getOrInsertFunction("__security_check_cookie",
2103 Type::getVoidTy(M.getContext()),
2104 Type::getInt8PtrTy(M.getContext())));
// The CRT checker is __fastcall and takes the cookie in a register.
2105 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2106 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2109 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2110 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2112 TargetLowering::insertSSPDeclarations(M);
// SelectionDAG-level stack guard value: the MSVC CRT cookie global when
// targeting the MSVC runtime, otherwise the generic default.
2115 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2116 // MSVC CRT has a global variable holding security cookie.
2117 if (Subtarget.getTargetTriple().isOSMSVCRT())
2118 return M.getGlobalVariable("__security_cookie");
2119 return TargetLowering::getSDagStackGuard(M);
// Guard-check function: the MSVC CRT's __security_check_cookie when
// targeting the MSVC runtime, otherwise the generic default (none).
2122 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2123 // MSVC CRT has a function to validate security cookie.
2124 if (Subtarget.getTargetTriple().isOSMSVCRT())
2125 return M.getFunction("__security_check_cookie");
2126 return TargetLowering::getSSPStackGuardCheck(M);
// Returns the address of the SafeStack unsafe-stack pointer, using the
// fixed TLS slots Android and Fuchsia reserve for it where available.
2129 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Contiki explicitly opts out of the TLS-based location.
2130 if (Subtarget.getTargetTriple().isOSContiki())
2131 return getDefaultSafeStackPointerLocation(IRB, false);
2133 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2134 // definition of TLS_SLOT_SAFESTACK in
2135 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2136 if (Subtarget.isTargetAndroid()) {
2137 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2139 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2140 return SegmentOffset(IRB, Offset, getAddressSpace());
2143 // Fuchsia is similar.
2144 if (Subtarget.isTargetFuchsia()) {
2145 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2146 return SegmentOffset(IRB, 0x18, getAddressSpace());
2149 return TargetLowering::getSafeStackPointerLocation(IRB);
// Casts between "flat" address spaces are no-ops; any cast involving the
// x86 segment address spaces (numbered 256 and above) is not.
2152 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2153 unsigned DestAS) const {
2154 assert(SrcAS != DestAS && "Expected different address spaces!");
2156 return SrcAS < 256 && DestAS < 256;
2159 //===----------------------------------------------------------------------===//
2160 // Return Value Calling Convention Implementation
2161 //===----------------------------------------------------------------------===//
2163 #include "X86GenCallingConv.inc"
// Runs the return-value calling-convention analysis to decide whether the
// given outgoing values can all be returned (in registers) for this CC.
2165 bool X86TargetLowering::CanLowerReturn(
2166 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2167 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2168 SmallVector<CCValAssign, 16> RVLocs;
2169 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2170 return CCInfo.CheckReturn(Outs, RetCC_X86);
// Scratch registers safe to clobber regardless of calling convention;
// the array is zero-terminated.
2173 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2174 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2178 /// Lowers mask values (v*i1) to the requested register-location type.
2179 /// \returns DAG node after lowering to register type
2180 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2181 const SDLoc &Dl, SelectionDAG &DAG) {
2182 EVT ValVT = ValArg.getValueType();
2184 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2185 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2186 // Two stage lowering might be required
2187 // bitcast: v8i1 -> i8 / v16i1 -> i16
2188 // anyextend: i8 -> i32 / i16 -> i32
2189 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2190 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2191 if (ValLoc == MVT::i32)
2192 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2194 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2195 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2196 // One stage lowering is required
2197 // bitcast: v32i1 -> i32 / v64i1 -> i64
2198 return DAG.getBitcast(ValLoc, ValArg);
// Any other mask/location pairing is handled with a plain sign-extend.
2200 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2203 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2204 static void Passv64i1ArgInRegs(
2205 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2206 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2207 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
// NOTE(review): the message names AVX512BMI but the predicate checks
// hasBMI() (plain BMI) — confirm whether hasVBMI() was intended.
2208 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2209 "Expected AVX512BW or AVX512BMI target!");
// Splitting across a register pair only happens on 32-bit targets.
2210 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2211 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2212 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2213 "The value should reside in two registers");
2215 // Before splitting the value we cast it to i64
2216 Arg = DAG.getBitcast(MVT::i64, Arg);
2218 // Splitting the value into two i32 types
2220 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2221 DAG.getConstant(0, Dl, MVT::i32));
2222 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2223 DAG.getConstant(1, Dl, MVT::i32));
2225 // Attach the two i32 types into corresponding registers
2226 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2227 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2231 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2233 const SmallVectorImpl<ISD::OutputArg> &Outs,
2234 const SmallVectorImpl<SDValue> &OutVals,
2235 const SDLoc &dl, SelectionDAG &DAG) const {
2236 MachineFunction &MF = DAG.getMachineFunction();
2237 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2239 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2240 report_fatal_error("X86 interrupts may not return any value");
2242 SmallVector<CCValAssign, 16> RVLocs;
2243 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2244 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2247 SmallVector<SDValue, 6> RetOps;
2248 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2249 // Operand #1 = Bytes To Pop
2250 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2253 // Copy the result values into the output registers.
2254 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2256 CCValAssign &VA = RVLocs[I];
2257 assert(VA.isRegLoc() && "Can only return in registers!");
2259 // Add the register to the CalleeSaveDisableRegs list.
2260 if (CallConv == CallingConv::X86_RegCall)
2261 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2263 SDValue ValToCopy = OutVals[OutsIndex];
2264 EVT ValVT = ValToCopy.getValueType();
2266 // Promote values to the appropriate types.
2267 if (VA.getLocInfo() == CCValAssign::SExt)
2268 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2269 else if (VA.getLocInfo() == CCValAssign::ZExt)
2270 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2271 else if (VA.getLocInfo() == CCValAssign::AExt) {
2272 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2273 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2275 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2277 else if (VA.getLocInfo() == CCValAssign::BCvt)
2278 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2280 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2281 "Unexpected FP-extend for return value.");
2283 // If this is x86-64, and we disabled SSE, we can't return FP values,
2284 // or SSE or MMX vectors.
2285 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2286 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2287 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2288 report_fatal_error("SSE register return with SSE disabled");
2290 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2291 // llvm-gcc has never done it right and no one has noticed, so this
2292 // should be OK for now.
2293 if (ValVT == MVT::f64 &&
2294 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2295 report_fatal_error("SSE2 register return with SSE2 disabled");
2297 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2298 // the RET instruction and handled by the FP Stackifier.
2299 if (VA.getLocReg() == X86::FP0 ||
2300 VA.getLocReg() == X86::FP1) {
2301 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2302 // change the value to the FP stack register class.
2303 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2304 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2305 RetOps.push_back(ValToCopy);
2306 // Don't emit a copytoreg.
2310 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2311 // which is returned in RAX / RDX.
2312 if (Subtarget.is64Bit()) {
2313 if (ValVT == MVT::x86mmx) {
2314 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2315 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2316 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2318 // If we don't have SSE2 available, convert to v4f32 so the generated
2319 // register is legal.
2320 if (!Subtarget.hasSSE2())
2321 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2326 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2328 if (VA.needsCustom()) {
2329 assert(VA.getValVT() == MVT::v64i1 &&
2330 "Currently the only custom case is when we split v64i1 to 2 regs");
2332 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2335 assert(2 == RegsToPass.size() &&
2336 "Expecting two registers after Pass64BitArgInRegs");
2338 // Add the second register to the CalleeSaveDisableRegs list.
2339 if (CallConv == CallingConv::X86_RegCall)
2340 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2342 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2345 // Add nodes to the DAG and add the values into the RetOps list
2346 for (auto &Reg : RegsToPass) {
2347 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2348 Flag = Chain.getValue(1);
2349 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2353 // Swift calling convention does not require we copy the sret argument
2354 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2356 // All x86 ABIs require that for returning structs by value we copy
2357 // the sret argument into %rax/%eax (depending on ABI) for the return.
2358 // We saved the argument into a virtual register in the entry block,
2359 // so now we copy the value out and into %rax/%eax.
2361 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2362 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2363 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2364 // either case FuncInfo->setSRetReturnReg() will have been called.
2365 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2366 // When we have both sret and another return value, we should use the
2367 // original Chain stored in RetOps[0], instead of the current Chain updated
2368 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2370 // For the case of sret and another return value, we have
2371 // Chain_0 at the function entry
2372 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2373 // If we use Chain_1 in getCopyFromReg, we will have
2374 // Val = getCopyFromReg(Chain_1)
2375 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2377 // getCopyToReg(Chain_0) will be glued together with
2378 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2379 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2380 // Data dependency from Unit B to Unit A due to usage of Val in
2381 // getCopyToReg(Chain_1, Val)
2382 // Chain dependency from Unit A to Unit B
2384 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2385 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2386 getPointerTy(MF.getDataLayout()));
2389 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2390 X86::RAX : X86::EAX;
2391 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2392 Flag = Chain.getValue(1);
2394 // RAX/EAX now acts like a return value.
2396 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2398 // Add the returned register to the CalleeSaveDisableRegs list.
2399 if (CallConv == CallingConv::X86_RegCall)
2400 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2403 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2404 const MCPhysReg *I =
2405 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2408 if (X86::GR64RegClass.contains(*I))
2409 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2411 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2415 RetOps[0] = Chain; // Update chain.
2417 // Add the flag if we have it.
2419 RetOps.push_back(Flag);
2421 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2422 if (CallConv == CallingConv::X86_INTR)
2423 opcode = X86ISD::IRET;
2424 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
// Return true if SDNode N is used only by a return node (possibly through a
// single CopyToReg / FP_EXTEND), so the call producing N may be tail-called.
// On success, Chain is updated to the chain feeding the tail-call sequence.
// NOTE(review): the end of this function (and its early returns) are elided
// in this listing; the visible checks conservatively reject unsafe cases.
2427 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// A candidate must produce exactly one value with exactly one use.
2428 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2431 SDValue TCChain = Chain;
2432 SDNode *Copy = *N->use_begin();
2433 if (Copy->getOpcode() == ISD::CopyToReg) {
2434 // If the copy has a glue operand, we conservatively assume it isn't safe to
2435 // perform a tail call.
2436 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2438 TCChain = Copy->getOperand(0);
2439 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
// Every user of the copy must be the return node itself.
2442 bool HasRet = false;
2443 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2445 if (UI->getOpcode() != X86ISD::RET_FLAG)
2447 // If we are returning more than one value, we can definitely
2448 // not make a tail call see PR19530
2449 if (UI->getNumOperands() > 4)
2451 if (UI->getNumOperands() == 4 &&
2452 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
// Choose the minimum legal type that a small return value should be extended
// to. Non-Darwin targets may return i1/i8/i16 unextended (as i8); Darwin
// keeps the historical Clang behaviour of extending i8/i16 to i32.
2464 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2465 ISD::NodeType ExtendKind) const {
2466 MVT ReturnMVT = MVT::i32;
2468 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2469 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2470 // The ABI does not require i1, i8 or i16 to be extended.
2472 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2473 // always extending i8/i16 return values, so keep doing that for now.
2475 ReturnMVT = MVT::i8;
// Return the wider of VT and the register type chosen above.
2478 EVT MinVT = getRegisterType(Context, ReturnMVT);
2479 return VT.bitsLT(MinVT) ? MinVT : VT;
2482 /// Reads two 32 bit registers and creates a 64 bit mask value.
2483 /// \param VA The current 32 bit value that need to be assigned.
2484 /// \param NextVA The next 32 bit value that need to be assigned.
2485 /// \param Root The parent DAG node.
2486 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2487 /// glue purposes. In the case the DAG is already using
2488 /// physical register instead of virtual, we should glue
2489 /// our new SDValue to InFlag SDvalue.
2490 /// \return a new SDvalue of size 64bit.
2491 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2492 SDValue &Root, SelectionDAG &DAG,
2493 const SDLoc &Dl, const X86Subtarget &Subtarget,
2494 SDValue *InFlag = nullptr) {
// Only valid for 32-bit AVX512BW targets, where a v64i1 mask is split
// across two 32-bit GPR locations.
2495 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2496 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2497 assert(VA.getValVT() == MVT::v64i1 &&
2498 "Expecting first location of 64 bit width type");
2499 assert(NextVA.getValVT() == VA.getValVT() &&
2500 "The locations should have the same type");
2501 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2502 "The values should reside in two registers");
2506 SDValue ArgValueLo, ArgValueHi;
2508 MachineFunction &MF = DAG.getMachineFunction();
2509 const TargetRegisterClass *RC = &X86::GR32RegClass;
2511 // Read a 32 bit value from the registers
2512 if (nullptr == InFlag) {
2513 // When no physical register is present,
2514 // create an intermediate virtual register
2515 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2516 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2517 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2518 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2520 // When a physical register is available read the value from it and glue
2521 // the reads together.
2523 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2524 *InFlag = ArgValueLo.getValue(2);
2526 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2527 *InFlag = ArgValueHi.getValue(2);
2530 // Convert the i32 type into v32i1 type
2531 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2533 // Convert the i32 type into v32i1 type
2534 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2536 // Concatenate the two values together
2537 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2540 /// The function will lower a register of various sizes (8/16/32/64)
2541 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2542 /// \returns a DAG node contains the operand after lowering to mask type.
2543 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2544 const EVT &ValLoc, const SDLoc &Dl,
2545 SelectionDAG &DAG) {
2546 SDValue ValReturned = ValArg;
2548 if (ValVT == MVT::v64i1) {
2549 // In 32 bit machine, this case is handled by getv64i1Argument
2550 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2551 // In 64 bit machine, There is no need to truncate the value only bitcast
// Pick the integer width matching the requested mask type, then truncate
// the incoming register value down to it before the final bitcast.
// NOTE(review): the switch cases are elided in this listing.
2554 switch (ValVT.getSimpleVT().SimpleTy) {
2565 llvm_unreachable("Expecting a vector of i1 types");
2568 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
// Reinterpret the (possibly truncated) integer as the v*i1 mask type.
2571 return DAG.getBitcast(ValVT, ValReturned);
2574 /// Lower the result values of a call into the
2575 /// appropriate copies out of appropriate physical registers.
// Runs RetCC_X86 over the call's results, copies each result out of its
// physical register (glued via InFlag), undoes any promotion/rounding, and
// appends the final values to InVals. RegMask (RegCall only) is updated to
// drop registers that carry return values.
2577 SDValue X86TargetLowering::LowerCallResult(
2578 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2579 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2580 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2581 uint32_t *RegMask) const {
2583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2584 // Assign locations to each value returned by this call.
2585 SmallVector<CCValAssign, 16> RVLocs;
2586 bool Is64Bit = Subtarget.is64Bit();
2587 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2589 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2591 // Copy all of the result registers out of their specified physreg.
2592 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2594 CCValAssign &VA = RVLocs[I];
2595 EVT CopyVT = VA.getLocVT();
2597 // In some calling conventions we need to remove the used registers
2598 // from the register mask.
2599 if (RegMask && CallConv == CallingConv::X86_RegCall) {
// Clear the bit for the result register and all of its sub-registers.
2600 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2601 SubRegs.isValid(); ++SubRegs)
2602 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2605 // If this is x86-64, and we disabled SSE, we can't return FP values
2606 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2607 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2608 report_fatal_error("SSE register return with SSE disabled");
2611 // If we prefer to use the value in xmm registers, copy it out as f80 and
2612 // use a truncate to move it from fp stack reg to xmm reg.
2613 bool RoundAfterCopy = false;
2614 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2615 isScalarFPTypeInSSEReg(VA.getValVT())) {
2616 if (!Subtarget.hasX87())
2617 report_fatal_error("X87 register return with X87 disabled");
2619 RoundAfterCopy = (CopyVT != VA.getLocVT());
2623 if (VA.needsCustom()) {
2624 assert(VA.getValVT() == MVT::v64i1 &&
2625 "Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 results arrive in two 32-bit registers; consume the next location.
2627 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2629 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2631 Val = Chain.getValue(0);
2632 InFlag = Chain.getValue(2);
// Round the f80 copied out of the x87 stack back to the expected width.
2636 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2637 // This truncation won't change the value.
2638 DAG.getIntPtrConstant(1, dl));
2640 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2641 if (VA.getValVT().isVector() &&
2642 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2643 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2644 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2645 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2647 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2650 InVals.push_back(Val);
2656 //===----------------------------------------------------------------------===//
2657 // C & StdCall & Fast Calling Convention implementation
2658 //===----------------------------------------------------------------------===//
2659 // The StdCall calling convention is standard for many Windows API
2660 // routines. It differs from the C calling convention only slightly:
2661 // the callee cleans up the stack rather than the caller, and symbols are
2662 // decorated (name-mangled) in a particular way. It does not support vector arguments.
2663 // For info on fast calling convention see Fast Calling Convention (tail call)
2664 // implementation LowerX86_32FastCCCallTo.
2666 /// CallIsStructReturn - Determines whether a call uses struct return
// Classification of how a struct-return value is conveyed: not sret at all,
// sret in a register, or sret on the stack.
// NOTE(review): the enumerator list is elided in this listing.
2668 enum StructReturnType {
// Classify a call's outgoing arguments: returns RegStructReturn when the
// first argument is an sret passed in a register (or on MCU, where sret is
// always register-based), StackStructReturn for a stack-based sret, and
// NotStructReturn otherwise.
2673 static StructReturnType
2674 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2676 return NotStructReturn;
2678 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2679 if (!Flags.isSRet())
2680 return NotStructReturn;
2681 if (Flags.isInReg() || IsMCU)
2682 return RegStructReturn;
2683 return StackStructReturn;
2686 /// Determines whether a function uses struct return semantics.
// Mirror of callIsStructReturn for a function's *incoming* arguments: checks
// whether the first formal argument carries the sret flag and classifies it
// as register- or stack-based struct return.
2687 static StructReturnType
2688 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2690 return NotStructReturn;
2692 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2693 if (!Flags.isSRet())
2694 return NotStructReturn;
2695 if (Flags.isInReg() || IsMCU)
2696 return RegStructReturn;
2697 return StackStructReturn;
2700 /// Make a copy of an aggregate at address specified by "Src" to address
2701 /// "Dst" with size and alignment information specified by the specific
2702 /// parameter attribute. The copy will be passed as a byval function parameter.
2703 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2704 SDValue Chain, ISD::ArgFlagsTy Flags,
2705 SelectionDAG &DAG, const SDLoc &dl) {
2706 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
// Emit a non-volatile, always-inlined memcpy; not marked as a tail call.
2708 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2709 /*isVolatile*/false, /*AlwaysInline=*/true,
2710 /*isTailCall*/false,
2711 MachinePointerInfo(), MachinePointerInfo());
2714 /// Return true if the calling convention is one that we can guarantee TCO for.
// These conventions (fastcc, GHC, RegCall, HiPE, HHVM) are designed so tail
// call optimization can always be performed.
2715 static bool canGuaranteeTCO(CallingConv::ID CC) {
2716 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2717 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2718 CC == CallingConv::HHVM);
2721 /// Return true if we might ever do TCO for calls with this calling convention.
// Accepts the C-family and callee-pop conventions outright; for anything
// else, falls back to the guaranteed-TCO set.
2722 static bool mayTailCallThisCC(CallingConv::ID CC) {
2724 // C calling conventions:
2725 case CallingConv::C:
2726 case CallingConv::X86_64_Win64:
2727 case CallingConv::X86_64_SysV:
2728 // Callee pop conventions:
2729 case CallingConv::X86_ThisCall:
2730 case CallingConv::X86_StdCall:
2731 case CallingConv::X86_VectorCall:
2732 case CallingConv::X86_FastCall:
2735 return canGuaranteeTCO(CC);
2739 /// Return true if the function is being made into a tailcall target by
2740 /// changing its ABI.
// Guaranteed TCO applies only when the -tailcallopt option is enabled AND the
// convention supports it.
2741 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2742 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
// Quick check (used before full lowering) of whether a call instruction could
// possibly be emitted as a tail call: it must be marked `tail`, the function
// must not have "disable-tail-calls"="true", and the callee's calling
// convention must be one we might tail-call.
// NOTE(review): the return statements are elided in this listing.
2745 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2747 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2748 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2751 ImmutableCallSite CS(CI);
2752 CallingConv::ID CalleeCC = CS.getCallingConv();
2753 if (!mayTailCallThisCC(CalleeCC))
// Lower a formal argument that lives in memory: create (or reuse) a fixed
// stack object at its assigned offset and load the value from it. Handles
// byval aggregates, copy-elision candidates, and the X86_INTR offset rework.
2760 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2761 const SmallVectorImpl<ISD::InputArg> &Ins,
2762 const SDLoc &dl, SelectionDAG &DAG,
2763 const CCValAssign &VA,
2764 MachineFrameInfo &MFI, unsigned i) const {
2765 // Create the nodes corresponding to a load from this parameter slot.
2766 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2767 bool AlwaysUseMutable = shouldGuaranteeTCO(
2768 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2769 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2771 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2773 // If value is passed by pointer we have address passed instead of the value
2774 // itself. No need to extend if the mask value and location share the same
// ExtendedInMem: an i1-element value that was widened to a larger location
// type in memory and must be truncated back after the load.
2776 bool ExtendedInMem =
2777 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2778 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2780 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2781 ValVT = VA.getLocVT();
2783 ValVT = VA.getValVT();
2785 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2786 // taken by a return address.
2788 if (CallConv == CallingConv::X86_INTR) {
2789 // X86 interrupts may take one or two arguments.
2790 // On the stack there will be no return address as in regular call.
2791 // Offset of last argument need to be set to -4/-8 bytes.
2792 // Where offset of the first argument out of two, should be set to 0 bytes.
2793 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2794 if (Subtarget.is64Bit() && Ins.size() == 2) {
2795 // The stack pointer needs to be realigned for 64 bit handlers with error
2796 // code, so the argument offset changes by 8 bytes.
2801 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2802 // changed with more analysis.
2803 // In case of tail call optimization mark all arguments mutable. Since they
2804 // could be overwritten by lowering of arguments in case of a tail call.
2805 if (Flags.isByVal()) {
2806 unsigned Bytes = Flags.getByValSize();
2807 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2808 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2809 // Adjust SP offset of interrupt parameter.
2810 if (CallConv == CallingConv::X86_INTR) {
2811 MFI.setObjectOffset(FI, Offset);
// byval arguments are passed by address of the stack copy; no load needed.
2813 return DAG.getFrameIndex(FI, PtrVT);
2816 // This is an argument in memory. We might be able to perform copy elision.
2817 if (Flags.isCopyElisionCandidate()) {
2818 EVT ArgVT = Ins[i].ArgVT;
2820 if (Ins[i].PartOffset == 0) {
2821 // If this is a one-part value or the first part of a multi-part value,
2822 // create a stack object for the entire argument value type and return a
2823 // load from our portion of it. This assumes that if the first part of an
2824 // argument is in memory, the rest will also be in memory.
2825 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2826 /*Immutable=*/false);
2827 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2829 ValVT, dl, Chain, PartAddr,
2830 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2832 // This is not the first piece of an argument in memory. See if there is
2833 // already a fixed stack object including this offset. If so, assume it
2834 // was created by the PartOffset == 0 branch above and create a load from
2835 // the appropriate offset into it.
2836 int64_t PartBegin = VA.getLocMemOffset();
2837 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2838 int FI = MFI.getObjectIndexBegin();
2839 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2840 int64_t ObjBegin = MFI.getObjectOffset(FI);
2841 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2842 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2845 if (MFI.isFixedObjectIndex(FI)) {
// Load this part from its offset within the previously-created object.
2847 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2848 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2850 ValVT, dl, Chain, Addr,
2851 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2852 Ins[i].PartOffset));
// Ordinary in-memory argument: create its fixed stack slot and load it.
2857 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2858 VA.getLocMemOffset(), isImmutable);
2860 // Set SExt or ZExt flag.
2861 if (VA.getLocInfo() == CCValAssign::ZExt) {
2862 MFI.setObjectZExt(FI, true);
2863 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2864 MFI.setObjectSExt(FI, true);
2867 // Adjust SP offset of interrupt parameter.
2868 if (CallConv == CallingConv::X86_INTR) {
2869 MFI.setObjectOffset(FI, Offset);
2872 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2873 SDValue Val = DAG.getLoad(
2874 ValVT, dl, Chain, FIN,
2875 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
// Truncate widened i1-element values back to the declared type.
2876 return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
2880 // FIXME: Get this from tablegen.
// Return the ordered list of 64-bit GPRs used for integer argument passing:
// RCX/RDX/R8/R9 for Win64 conventions, RDI/RSI/RDX/RCX/R8/R9 for SysV.
2881 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2882 const X86Subtarget &Subtarget) {
2883 assert(Subtarget.is64Bit());
2885 if (Subtarget.isCallingConvWin64(CallConv)) {
2886 static const MCPhysReg GPR64ArgRegsWin64[] = {
2887 X86::RCX, X86::RDX, X86::R8, X86::R9
2889 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2895 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2898 // FIXME: Get this from tablegen.
// Return the XMM registers usable for 64-bit vararg FP argument passing
// (XMM0-XMM7 for SysV); empty for Win64 (GPR-shadowed) or when SSE is
// unavailable / implicit FP is disallowed.
2899 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2900 CallingConv::ID CallConv,
2901 const X86Subtarget &Subtarget) {
2902 assert(Subtarget.is64Bit());
2903 if (Subtarget.isCallingConvWin64(CallConv)) {
2904 // The XMM registers which might contain var arg parameters are shadowed
2905 // in their paired GPR. So we only need to save the GPR to their home
2907 // TODO: __vectorcall will change this.
2911 const Function *Fn = MF.getFunction();
2912 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2913 bool isSoftFloat = Subtarget.useSoftFloat();
2914 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2915 "SSE register cannot be used when SSE is disabled!");
2916 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2917 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2921 static const MCPhysReg XMMArgRegs64Bit[] = {
2922 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2923 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2925 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
// Return true if the assigned argument locations are sorted by their original
// value number (used to validate CC analysis output before lowering).
2929 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2930 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2931 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2932 return A.getValNo() < B.getValNo();
2937 SDValue X86TargetLowering::LowerFormalArguments(
2938 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2939 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2940 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2941 MachineFunction &MF = DAG.getMachineFunction();
2942 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2943 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2945 const Function *Fn = MF.getFunction();
2946 if (Fn->hasExternalLinkage() &&
2947 Subtarget.isTargetCygMing() &&
2948 Fn->getName() == "main")
2949 FuncInfo->setForceFramePointer(true);
2951 MachineFrameInfo &MFI = MF.getFrameInfo();
2952 bool Is64Bit = Subtarget.is64Bit();
2953 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2956 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2957 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2959 if (CallConv == CallingConv::X86_INTR) {
2960 bool isLegal = Ins.size() == 1 ||
2961 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2962 (!Is64Bit && Ins[1].VT == MVT::i32)));
2964 report_fatal_error("X86 interrupts may take one or two arguments");
2967 // Assign locations to all of the incoming arguments.
2968 SmallVector<CCValAssign, 16> ArgLocs;
2969 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2971 // Allocate shadow area for Win64.
2973 CCInfo.AllocateStack(32, 8);
2975 CCInfo.AnalyzeArguments(Ins, CC_X86);
2977 // In vectorcall calling convention a second pass is required for the HVA
2979 if (CallingConv::X86_VectorCall == CallConv) {
2980 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2983 // The next loop assumes that the locations are in the same order of the
2985 assert(isSortedByValueNo(ArgLocs) &&
2986 "Argument Location list must be sorted before lowering");
2989 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2991 assert(InsIndex < Ins.size() && "Invalid Ins index");
2992 CCValAssign &VA = ArgLocs[I];
2994 if (VA.isRegLoc()) {
2995 EVT RegVT = VA.getLocVT();
2996 if (VA.needsCustom()) {
2998 VA.getValVT() == MVT::v64i1 &&
2999 "Currently the only custom case is when we split v64i1 to 2 regs");
3001 // v64i1 values, in regcall calling convention, that are
3002 // compiled to 32 bit arch, are split up into two registers.
3004 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3006 const TargetRegisterClass *RC;
3007 if (RegVT == MVT::i32)
3008 RC = &X86::GR32RegClass;
3009 else if (Is64Bit && RegVT == MVT::i64)
3010 RC = &X86::GR64RegClass;
3011 else if (RegVT == MVT::f32)
3012 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3013 else if (RegVT == MVT::f64)
3014 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3015 else if (RegVT == MVT::f80)
3016 RC = &X86::RFP80RegClass;
3017 else if (RegVT == MVT::f128)
3018 RC = &X86::FR128RegClass;
3019 else if (RegVT.is512BitVector())
3020 RC = &X86::VR512RegClass;
3021 else if (RegVT.is256BitVector())
3022 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3023 else if (RegVT.is128BitVector())
3024 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3025 else if (RegVT == MVT::x86mmx)
3026 RC = &X86::VR64RegClass;
3027 else if (RegVT == MVT::i1)
3028 RC = &X86::VK1RegClass;
3029 else if (RegVT == MVT::v8i1)
3030 RC = &X86::VK8RegClass;
3031 else if (RegVT == MVT::v16i1)
3032 RC = &X86::VK16RegClass;
3033 else if (RegVT == MVT::v32i1)
3034 RC = &X86::VK32RegClass;
3035 else if (RegVT == MVT::v64i1)
3036 RC = &X86::VK64RegClass;
3038 llvm_unreachable("Unknown argument type!");
3040 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3041 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3044 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3045 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3047 if (VA.getLocInfo() == CCValAssign::SExt)
3048 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3049 DAG.getValueType(VA.getValVT()));
3050 else if (VA.getLocInfo() == CCValAssign::ZExt)
3051 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3052 DAG.getValueType(VA.getValVT()));
3053 else if (VA.getLocInfo() == CCValAssign::BCvt)
3054 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3056 if (VA.isExtInLoc()) {
3057 // Handle MMX values passed in XMM regs.
3058 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3059 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3060 else if (VA.getValVT().isVector() &&
3061 VA.getValVT().getScalarType() == MVT::i1 &&
3062 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3063 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3064 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3065 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3067 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3070 assert(VA.isMemLoc());
3072 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3075 // If value is passed via pointer - do a load.
3076 if (VA.getLocInfo() == CCValAssign::Indirect)
3078 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3080 InVals.push_back(ArgValue);
3083 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3084 // Swift calling convention does not require we copy the sret argument
3085 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3086 if (CallConv == CallingConv::Swift)
3089 // All x86 ABIs require that for returning structs by value we copy the
3090 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3091 // the argument into a virtual register so that we can access it from the
3093 if (Ins[I].Flags.isSRet()) {
3094 unsigned Reg = FuncInfo->getSRetReturnReg();
3096 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3097 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3098 FuncInfo->setSRetReturnReg(Reg);
3100 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3101 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3106 unsigned StackSize = CCInfo.getNextStackOffset();
3107 // Align stack specially for tail calls.
3108 if (shouldGuaranteeTCO(CallConv,
3109 MF.getTarget().Options.GuaranteedTailCallOpt))
3110 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3112 // If the function takes variable number of arguments, make a frame index for
3113 // the start of the first vararg value... for expansion of llvm.va_start. We
3114 // can skip this if there are no va_start calls.
3115 if (MFI.hasVAStart() &&
3116 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3117 CallConv != CallingConv::X86_ThisCall))) {
3118 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3121 // Figure out if XMM registers are in use.
3122 assert(!(Subtarget.useSoftFloat() &&
3123 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3124 "SSE register cannot be used when SSE is disabled!");
3126 // 64-bit calling conventions support varargs and register parameters, so we
3127 // have to do extra work to spill them in the prologue.
3128 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3129 // Find the first unallocated argument registers.
3130 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3131 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3132 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3133 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3134 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3135 "SSE register cannot be used when SSE is disabled!");
3137 // Gather all the live in physical registers.
3138 SmallVector<SDValue, 6> LiveGPRs;
3139 SmallVector<SDValue, 8> LiveXMMRegs;
3141 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3142 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3144 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3146 if (!ArgXMMs.empty()) {
3147 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3148 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3149 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3150 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3151 LiveXMMRegs.push_back(
3152 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3157 // Get to the caller-allocated home save location. Add 8 to account
3158 // for the return address.
3159 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3160 FuncInfo->setRegSaveFrameIndex(
3161 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3162 // Fixup to set vararg frame on shadow area (4 x i64).
3164 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3166 // For X86-64, if there are vararg parameters that are passed via
3167 // registers, then we must store them to their spots on the stack so
3168 // they may be loaded by dereferencing the result of va_next.
3169 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3170 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3171 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3172 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3175 // Store the integer parameter registers.
3176 SmallVector<SDValue, 8> MemOps;
3177 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3178 getPointerTy(DAG.getDataLayout()));
3179 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3180 for (SDValue Val : LiveGPRs) {
3181 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3182 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3184 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3185 MachinePointerInfo::getFixedStack(
3186 DAG.getMachineFunction(),
3187 FuncInfo->getRegSaveFrameIndex(), Offset));
3188 MemOps.push_back(Store);
3192 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3193 // Now store the XMM (fp + vector) parameter registers.
3194 SmallVector<SDValue, 12> SaveXMMOps;
3195 SaveXMMOps.push_back(Chain);
3196 SaveXMMOps.push_back(ALVal);
3197 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3198 FuncInfo->getRegSaveFrameIndex(), dl));
3199 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3200 FuncInfo->getVarArgsFPOffset(), dl));
3201 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3203 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3204 MVT::Other, SaveXMMOps));
3207 if (!MemOps.empty())
3208 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3211 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3212 // Find the largest legal vector type.
3213 MVT VecVT = MVT::Other;
3214 // FIXME: Only some x86_32 calling conventions support AVX512.
3215 if (Subtarget.hasAVX512() &&
3216 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3217 CallConv == CallingConv::Intel_OCL_BI)))
3218 VecVT = MVT::v16f32;
3219 else if (Subtarget.hasAVX())
3221 else if (Subtarget.hasSSE2())
3224 // We forward some GPRs and some vector types.
3225 SmallVector<MVT, 2> RegParmTypes;
3226 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3227 RegParmTypes.push_back(IntVT);
3228 if (VecVT != MVT::Other)
3229 RegParmTypes.push_back(VecVT);
3231 // Compute the set of forwarded registers. The rest are scratch.
3232 SmallVectorImpl<ForwardedRegister> &Forwards =
3233 FuncInfo->getForwardedMustTailRegParms();
3234 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3236 // Conservatively forward AL on x86_64, since it might be used for varargs.
3237 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3238 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3239 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3242 // Copy all forwards from physical to virtual registers.
3243 for (ForwardedRegister &F : Forwards) {
3244 // FIXME: Can we use a less constrained schedule?
3245 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3246 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3247 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3251 // Some CCs need callee pop.
3252 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3253 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3254 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3255 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3256 // X86 interrupts must pop the error code (and the alignment padding) if
3258 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3260 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3261 // If this is an sret function, the return should pop the hidden pointer.
3262 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3263 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3264 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3265 FuncInfo->setBytesToPopOnReturn(4);
3269 // RegSaveFrameIndex is X86-64 only.
3270 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3271 if (CallConv == CallingConv::X86_FastCall ||
3272 CallConv == CallingConv::X86_ThisCall)
3273 // fastcc functions can't have varargs.
3274 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3277 FuncInfo->setArgumentStackSize(StackSize);
3279 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3280 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3281 if (Personality == EHPersonality::CoreCLR) {
3283 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3284 // that we'd prefer this slot be allocated towards the bottom of the frame
3285 // (i.e. near the stack pointer after allocating the frame). Every
3286 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3287 // offset from the bottom of this and each funclet's frame must be the
3288 // same, so the size of funclets' (mostly empty) frames is dictated by
3289 // how far this slot is from the bottom (since they allocate just enough
3290 // space to accommodate holding this slot at the correct offset).
3291 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3292 EHInfo->PSPSymFrameIdx = PSPSymFI;
3296 if (CallConv == CallingConv::X86_RegCall) {
3297 const MachineRegisterInfo &MRI = MF.getRegInfo();
3298 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3299 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
/// Lower one outgoing call argument that was assigned a stack location by the
/// calling-convention analysis: compute its address as StackPtr plus the
/// location's memory offset, then either memcpy a byval aggregate into place
/// or emit a plain store of \p Arg. Returns the updated chain.
3305 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3306                                              SDValue Arg, const SDLoc &dl,
3308                                              const CCValAssign &VA,
3309                                              ISD::ArgFlagsTy Flags) const {
3310   unsigned LocMemOffset = VA.getLocMemOffset();
3311   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3312   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
  // byval aggregates are copied wholesale rather than stored as a scalar.
3314   if (Flags.isByVal())
3315     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3317   return DAG.getStore(
3318       Chain, dl, Arg, PtrOff,
3319       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3322 /// Emit a load of the return address if tail call
3323 /// optimization is performed and it is required.
/// \p OutRetAddr receives the loaded return-address value; the function's
/// return value is the load's output chain (result number 1 of the load node),
/// so callers can sequence later operations after this load.
3324 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3325     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3326     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3327   // Adjust the Return address stack slot.
3328   EVT VT = getPointerTy(DAG.getDataLayout());
3329   OutRetAddr = getReturnAddressFrameIndex(DAG);
3331   // Load the "old" Return address.
3332   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  // Hand back the load's chain result so the caller can order against it.
3333   return SDValue(OutRetAddr.getNode(), 1);
3336 /// Emit a store of the return address if tail call
3337 /// optimization is performed and it is required (FPDiff!=0).
/// \p RetAddrFrIdx is the previously loaded return-address value; it is stored
/// into a new fixed stack object placed FPDiff - SlotSize below the incoming
/// return-address slot. Returns the chain unchanged when FPDiff is zero.
3338 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3339                                         SDValue Chain, SDValue RetAddrFrIdx,
3340                                         EVT PtrVT, unsigned SlotSize,
3341                                         int FPDiff, const SDLoc &dl) {
3342   // Store the return address to the appropriate stack slot.
  // No frame-pointer delta means the slot did not move; nothing to do.
3343   if (!FPDiff) return Chain;
3344   // Calculate the new stack slot for the return address.
3345   int NewReturnAddrFI =
3346     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3348   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3349   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3350                        MachinePointerInfo::getFixedStack(
3351                            DAG.getMachineFunction(), NewReturnAddrFI));
3355 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3356 /// operation of specified width.
/// The mask selects element 0 from V2 (shuffle index NumElems) and the
/// remaining elements from V1, matching the MOVS{S,D}/MOVD semantics of
/// replacing only the low element.
3357 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3359   unsigned NumElems = VT.getVectorNumElements();
3360   SmallVector<int, 8> Mask;
  // Index NumElems refers to element 0 of the second operand (V2).
3361   Mask.push_back(NumElems);
3362   for (unsigned i = 1; i != NumElems; ++i)
3364   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
/// Lower an outgoing call for the X86 target: classify arguments via the
/// calling convention, place them in registers or outgoing stack slots, handle
/// tail-call/sibcall/musttail shaping of the stack, materialize the callee
/// address, and emit either an X86ISD::TC_RETURN (tail call) or an
/// X86ISD::CALL node followed by CALLSEQ_END and result-copy lowering.
3368 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3369                              SmallVectorImpl<SDValue> &InVals) const {
3370   SelectionDAG &DAG = CLI.DAG;
3372   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3373   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3374   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3375   SDValue Chain = CLI.Chain;
3376   SDValue Callee = CLI.Callee;
3377   CallingConv::ID CallConv = CLI.CallConv;
3378   bool &isTailCall = CLI.IsTailCall;
3379   bool isVarArg = CLI.IsVarArg;
3381   MachineFunction &MF = DAG.getMachineFunction();
3382   bool Is64Bit = Subtarget.is64Bit();
3383   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3384   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3385   bool IsSibcall = false;
3386   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3387   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3389   if (CallConv == CallingConv::X86_INTR)
3390     report_fatal_error("X86 interrupts may not be called directly");
3392   if (Attr.getValueAsString() == "true")
3395   if (Subtarget.isPICStyleGOT() &&
3396       !MF.getTarget().Options.GuaranteedTailCallOpt) {
3397     // If we are using a GOT, disable tail calls to external symbols with
3398     // default visibility. Tail calling such a symbol requires using a GOT
3399     // relocation, which forces early binding of the symbol. This breaks code
3400     // that requires lazy function symbol resolution. Using musttail or
3401     // GuaranteedTailCallOpt will override this.
3402     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3403     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3404                G->getGlobal()->hasDefaultVisibility()))
3408   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3410     // Force this to be a tail call. The verifier rules are enough to ensure
3411     // that we can lower this successfully without moving the return address
3414   } else if (isTailCall) {
3415     // Check if it's really possible to do a tail call.
3416     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3417                     isVarArg, SR != NotStructReturn,
3418                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3419                     Outs, OutVals, Ins, DAG);
3421     // Sibcalls are automatically detected tailcalls which do not require
3423     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3430   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3431          "Var args not supported with calling convention fastcc, ghc or hipe");
3433   // Analyze operands of the call, assigning locations to each operand.
3434   SmallVector<CCValAssign, 16> ArgLocs;
3435   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3437   // Allocate shadow area for Win64.
3439     CCInfo.AllocateStack(32, 8);
3441   CCInfo.AnalyzeArguments(Outs, CC_X86);
3443   // In vectorcall calling convention a second pass is required for the HVA
3445   if (CallingConv::X86_VectorCall == CallConv) {
3446     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3449   // Get a count of how many bytes are to be pushed on the stack.
3450   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3452     // This is a sibcall. The memory operands are available in caller's
3453     // own caller's stack.
3455   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3456            canGuaranteeTCO(CallConv))
3457     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3460   if (isTailCall && !IsSibcall && !IsMustTail) {
3461     // Lower arguments at fp - stackoffset + fpdiff.
3462     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3464     FPDiff = NumBytesCallerPushed - NumBytes;
3466     // Set the delta of movement of the returnaddr stackslot.
3467     // But only set if delta is greater than previous delta.
3468     if (FPDiff < X86Info->getTCReturnAddrDelta())
3469       X86Info->setTCReturnAddrDelta(FPDiff);
3472   unsigned NumBytesToPush = NumBytes;
3473   unsigned NumBytesToPop = NumBytes;
3475   // If we have an inalloca argument, all stack space has already been allocated
3476   // for us and be right at the top of the stack.  We don't support multiple
3477   // arguments passed in memory when using inalloca.
3478   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3480     if (!ArgLocs.back().isMemLoc())
3481       report_fatal_error("cannot use inalloca attribute on a register "
3483     if (ArgLocs.back().getLocMemOffset() != 0)
3484       report_fatal_error("any parameter with the inalloca attribute must be "
3485                          "the only memory argument");
3489   Chain = DAG.getCALLSEQ_START(
3490       Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3492   SDValue RetAddrFrIdx;
3493   // Load return address for tail calls.
3494   if (isTailCall && FPDiff)
3495     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3496                                     Is64Bit, FPDiff, dl);
3498   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3499   SmallVector<SDValue, 8> MemOpChains;
3502   // The next loop assumes that the locations are in the same order of the
3504   assert(isSortedByValueNo(ArgLocs) &&
3505          "Argument Location list must be sorted before lowering");
3507   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3508   // of tail call optimization arguments are handled later.
3509   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3510   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3512     assert(OutIndex < Outs.size() && "Invalid Out index");
3513     // Skip inalloca arguments, they have already been written.
3514     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3515     if (Flags.isInAlloca())
3518     CCValAssign &VA = ArgLocs[I];
3519     EVT RegVT = VA.getLocVT();
3520     SDValue Arg = OutVals[OutIndex];
3521     bool isByVal = Flags.isByVal();
3523     // Promote the value if needed.
3524     switch (VA.getLocInfo()) {
3525     default: llvm_unreachable("Unknown loc info!");
3526     case CCValAssign::Full: break;
3527     case CCValAssign::SExt:
3528       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3530     case CCValAssign::ZExt:
3531       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3533     case CCValAssign::AExt:
3534       if (Arg.getValueType().isVector() &&
3535           Arg.getValueType().getVectorElementType() == MVT::i1)
3536         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3537       else if (RegVT.is128BitVector()) {
3538         // Special case: passing MMX values in XMM registers.
3539         Arg = DAG.getBitcast(MVT::i64, Arg);
3540         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3541         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3543         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3545     case CCValAssign::BCvt:
3546       Arg = DAG.getBitcast(RegVT, Arg);
3548     case CCValAssign::Indirect: {
3549       // Store the argument.
3550       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3551       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3552       Chain = DAG.getStore(
3553           Chain, dl, Arg, SpillSlot,
3554           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    // Dispatch the (possibly promoted) argument to a register or stack slot.
3560     if (VA.needsCustom()) {
3561       assert(VA.getValVT() == MVT::v64i1 &&
3562              "Currently the only custom case is when we split v64i1 to 2 regs");
3563       // Split v64i1 value into two registers
3564       Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3566     } else if (VA.isRegLoc()) {
3567       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3568       if (isVarArg && IsWin64) {
3569         // Win64 ABI requires argument XMM reg to be copied to the corresponding
3570         // shadow reg if callee is a varargs function.
3571         unsigned ShadowReg = 0;
3572         switch (VA.getLocReg()) {
3573         case X86::XMM0: ShadowReg = X86::RCX; break;
3574         case X86::XMM1: ShadowReg = X86::RDX; break;
3575         case X86::XMM2: ShadowReg = X86::R8; break;
3576         case X86::XMM3: ShadowReg = X86::R9; break;
3579           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3581     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3582       assert(VA.isMemLoc());
3583       if (!StackPtr.getNode())
3584         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3585                                       getPointerTy(DAG.getDataLayout()));
3586       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3587                                              dl, DAG, VA, Flags));
3591   if (!MemOpChains.empty())
3592     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3594   if (Subtarget.isPICStyleGOT()) {
3595     // ELF / PIC requires GOT in the EBX register before function calls via PLT
3598       RegsToPass.push_back(std::make_pair(
3599           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3600                                           getPointerTy(DAG.getDataLayout()))));
3602       // If we are tail calling and generating PIC/GOT style code load the
3603       // address of the callee into ECX. The value in ecx is used as target of
3604       // the tail jump. This is done to circumvent the ebx/callee-saved problem
3605       // for tail calls on PIC/GOT architectures. Normally we would just put the
3606       // address of GOT into ebx and then call target@PLT. But for tail calls
3607       // ebx would be restored (since ebx is callee saved) before jumping to the
3610       // Note: The actual moving to ECX is done further down.
3611       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3612       if (G && !G->getGlobal()->hasLocalLinkage() &&
3613           G->getGlobal()->hasDefaultVisibility())
3614         Callee = LowerGlobalAddress(Callee, DAG);
3615       else if (isa<ExternalSymbolSDNode>(Callee))
3616         Callee = LowerExternalSymbol(Callee, DAG);
3620   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3621     // From AMD64 ABI document:
3622     // For calls that may call functions that use varargs or stdargs
3623     // (prototype-less calls or calls to functions containing ellipsis (...) in
3624     // the declaration) %al is used as hidden argument to specify the number
3625     // of SSE registers used. The contents of %al do not need to match exactly
3626     // the number of registers, but must be an upper bound on the number of SSE
3627     // registers used and is in the range 0 - 8 inclusive.
3629     // Count the number of XMM registers allocated.
3630     static const MCPhysReg XMMArgRegs[] = {
3631       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3632       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3634     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3635     assert((Subtarget.hasSSE1() || !NumXMMRegs)
3636            && "SSE registers cannot be used when SSE is disabled");
3638     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3639                                         DAG.getConstant(NumXMMRegs, dl,
  // musttail varargs: re-forward the registers spilled in the prologue.
3643   if (isVarArg && IsMustTail) {
3644     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3645     for (const auto &F : Forwards) {
3646       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3647       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3651   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3652   // don't need this because the eligibility check rejects calls that require
3653   // shuffling arguments passed in memory.
3654   if (!IsSibcall && isTailCall) {
3655     // Force all the incoming stack arguments to be loaded from the stack
3656     // before any new outgoing arguments are stored to the stack, because the
3657     // outgoing stack slots may alias the incoming argument stack slots, and
3658     // the alias isn't otherwise explicit. This is slightly more conservative
3659     // than necessary, because it means that each store effectively depends
3660     // on every argument instead of just those arguments it would clobber.
3661     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3663     SmallVector<SDValue, 8> MemOpChains2;
3666     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3668       CCValAssign &VA = ArgLocs[I];
3670       if (VA.isRegLoc()) {
3671         if (VA.needsCustom()) {
3672           assert((CallConv == CallingConv::X86_RegCall) &&
3673                  "Expecting custom case only in regcall calling convention");
3674           // This means that we are in special case where one argument was
3675           // passed through two register locations - Skip the next location
3682       assert(VA.isMemLoc());
3683       SDValue Arg = OutVals[OutsIndex];
3684       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3685       // Skip inalloca arguments.  They don't require any work.
3686       if (Flags.isInAlloca())
3688       // Create frame index.
3689       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3690       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3691       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3692       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3694       if (Flags.isByVal()) {
3695         // Copy relative to framepointer.
3696         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3697         if (!StackPtr.getNode())
3698           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3699                                         getPointerTy(DAG.getDataLayout()));
3700         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3703         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3707         // Store relative to framepointer.
3708         MemOpChains2.push_back(DAG.getStore(
3709             ArgChain, dl, Arg, FIN,
3710             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3714     if (!MemOpChains2.empty())
3715       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3717     // Store the return address to the appropriate stack slot.
3718     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3719                                      getPointerTy(DAG.getDataLayout()),
3720                                      RegInfo->getSlotSize(), FPDiff, dl);
3723   // Build a sequence of copy-to-reg nodes chained together with token chain
3724   // and flag operands which copy the outgoing args into registers.
3726   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3727     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3728                              RegsToPass[i].second, InFlag);
3729     InFlag = Chain.getValue(1);
  // Materialize the callee address as a target node where possible.
3732   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3733     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3734     // In the 64-bit large code model, we have to make all calls
3735     // through a register, since the call instruction's 32-bit
3736     // pc-relative offset may not be large enough to hold the whole
3738   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3739     // If the callee is a GlobalAddress node (quite common, every direct call
3740     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3742     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3744     // We should use extra load for direct calls to dllimported functions in
3746     const GlobalValue *GV = G->getGlobal();
3747     if (!GV->hasDLLImportStorageClass()) {
3748       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3750       Callee = DAG.getTargetGlobalAddress(
3751           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3753       if (OpFlags == X86II::MO_GOTPCREL) {
3755         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3756                              getPointerTy(DAG.getDataLayout()), Callee);
3757         // Add extra indirection
3758         Callee = DAG.getLoad(
3759             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3760             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3763   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3764     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3765     unsigned char OpFlags =
3766         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3768     Callee = DAG.getTargetExternalSymbol(
3769         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3770   } else if (Subtarget.isTarget64BitILP32() &&
3771              Callee->getValueType(0) == MVT::i32) {
3772     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3773     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3776   // Returns a chain & a flag for retval copy to use.
3777   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3778   SmallVector<SDValue, 8> Ops;
3780   if (!IsSibcall && isTailCall) {
3781     Chain = DAG.getCALLSEQ_END(Chain,
3782                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3783                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3784     InFlag = Chain.getValue(1);
3787   Ops.push_back(Chain);
3788   Ops.push_back(Callee);
3791     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3793   // Add argument registers to the end of the list so that they are known live
3795   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3796     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3797                                   RegsToPass[i].second.getValueType()));
3799   // Add a register mask operand representing the call-preserved registers.
3800   const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3801   assert(Mask && "Missing call preserved mask for calling convention");
3803   // If this is an invoke in a 32-bit function using a funclet-based
3804   // personality, assume the function clobbers all registers. If an exception
3805   // is thrown, the runtime will not restore CSRs.
3806   // FIXME: Model this more precisely so that we can register allocate across
3807   // the normal edge and spill and fill across the exceptional edge.
3808   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3809     const Function *CallerFn = MF.getFunction();
3810     EHPersonality Pers =
3811         CallerFn->hasPersonalityFn()
3812             ? classifyEHPersonality(CallerFn->getPersonalityFn())
3813             : EHPersonality::Unknown;
3814     if (isFuncletEHPersonality(Pers))
3815       Mask = RegInfo->getNoPreservedMask();
3818   // Define a new register mask from the existing mask.
3819   uint32_t *RegMask = nullptr;
3821   // In some calling conventions we need to remove the used physical registers
3822   // from the reg mask.
3823   if (CallConv == CallingConv::X86_RegCall) {
3824     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3826     // Allocate a new Reg Mask and copy Mask.
3827     RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3828     unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3829     memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3831     // Make sure all sub registers of the argument registers are reset
3833     for (auto const &RegPair : RegsToPass)
3834       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3835            SubRegs.isValid(); ++SubRegs)
3836         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3838     // Create the RegMask Operand according to our updated mask.
3839     Ops.push_back(DAG.getRegisterMask(RegMask));
3841     // Create the RegMask Operand according to the static mask.
3842     Ops.push_back(DAG.getRegisterMask(Mask));
3845   if (InFlag.getNode())
3846     Ops.push_back(InFlag);
3850     //// If this is the first return lowered for this function, add the regs
3851     //// to the liveout set for the function.
3852     // This isn't right, although it's probably harmless on x86; liveouts
3853     // should be computed from returns not tail calls.  Consider a void
3854     // function making a tail call to a function returning int.
3855     MF.getFrameInfo().setHasTailCall();
3856     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3859   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3860   InFlag = Chain.getValue(1);
3862   // Create the CALLSEQ_END node.
3863   unsigned NumBytesForCalleeToPop;
3864   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3865                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3866     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3867   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3868            !Subtarget.getTargetTriple().isOSMSVCRT() &&
3869            SR == StackStructReturn)
3870     // If this is a call to a struct-return function, the callee
3871     // pops the hidden struct pointer, so we have to push it back.
3872     // This is common for Darwin/X86, Linux & Mingw32 targets.
3873     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3874     NumBytesForCalleeToPop = 4;
3876     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3878   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3879     // No need to reset the stack after the call if the call doesn't return. To
3880     // make the MI verify, we'll pretend the callee does it for us.
3881     NumBytesForCalleeToPop = NumBytes;
3884   // Returns a flag for retval copy to use.
3886     Chain = DAG.getCALLSEQ_END(Chain,
3887                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3888                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3891     InFlag = Chain.getValue(1);
3894   // Handle result values, copying them out of physregs into vregs that we
3896   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3900 //===----------------------------------------------------------------------===//
3901 // Fast Calling Convention (tail call) implementation
3902 //===----------------------------------------------------------------------===//
3904 // Like the stdcall convention, the callee cleans up the arguments, except
3905 // that ECX is reserved for storing the address of the tail-called function.
3905 // Only 2 registers are
3906 // free for argument passing (inreg). Tail call optimization is performed
3908 // * tailcallopt is enabled
3909 // * caller/callee are fastcc
3910 // On X86_64 architecture with GOT-style position independent code only local
3911 // (within module) calls are supported at the moment.
3912 // To keep the stack aligned according to platform abi the function
3913 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
3914 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3915 // If a tail called function callee has more arguments than the caller the
3916 // caller needs to make sure that there is room to move the RETADDR to. This is
3917 // achieved by reserving an area the size of the argument delta right after the
3918 // original RETADDR, but before the saved framepointer or the spilled registers
3919 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3931 /// Round up \p StackSize so that, together with the return-address slot, the
3931 /// outgoing frame stays aligned to the target stack alignment — e.g. it
3931 /// produces sizes of the form 16n + 12 for a 16-byte alignment with a 4-byte
3931 /// return-address slot.
3934 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3935                                                SelectionDAG& DAG) const {
3936   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3937   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3938   unsigned StackAlignment = TFI.getStackAlignment();
3939   uint64_t AlignMask = StackAlignment - 1;
3940   int64_t Offset = StackSize;
  // SlotSize is the size of the return-address slot (4 or 8 bytes).
3941   unsigned SlotSize = RegInfo->getSlotSize();
3942   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3943     // Number smaller than 12 so just add the difference.
3944     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3946     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3947     Offset = ((~AlignMask) & Offset) + StackAlignment +
3948              (StackAlignment-SlotSize);
3953 /// Return true if the given stack call argument is already available in the
3954 /// same position (relatively) of the caller's incoming argument stack.
/// Used by tail-call eligibility checking: if every outgoing stack argument
/// already lives in the matching incoming slot, no argument shuffling is
/// needed. Walks through bit-preserving wrappers, then tries to trace \p Arg
/// back to a fixed frame-index object and compares offset and size with the
/// assigned location.
3956 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3957                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3958                          const X86InstrInfo *TII, const CCValAssign &VA) {
3959   unsigned Bytes = Arg.getValueSizeInBits() / 8;
3962   // Look through nodes that don't alter the bits of the incoming value.
3963   unsigned Op = Arg.getOpcode();
3964   if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3965     Arg = Arg.getOperand(0);
3968   if (Op == ISD::TRUNCATE) {
3969     const SDValue &TruncInput = Arg.getOperand(0);
    // A truncate of an AssertZext back to the truncated type is lossless.
3970     if (TruncInput.getOpcode() == ISD::AssertZext &&
3971         cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3972             Arg.getValueType()) {
3973       Arg = TruncInput.getOperand(0);
  // Case 1: the value was copied from a vreg; inspect its defining instr.
3981   if (Arg.getOpcode() == ISD::CopyFromReg) {
3982     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3983     if (!TargetRegisterInfo::isVirtualRegister(VR))
3985     MachineInstr *Def = MRI->getVRegDef(VR);
3988     if (!Flags.isByVal()) {
      // Non-byval: the def must be a reload from a stack slot.
3989       if (!TII->isLoadFromStackSlot(*Def, FI))
3992       unsigned Opcode = Def->getOpcode();
      // byval: the def must be an LEA of a frame index (address of the slot).
3993       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3994            Opcode == X86::LEA64_32r) &&
3995           Def->getOperand(1).isFI()) {
3996         FI = Def->getOperand(1).getIndex();
3997         Bytes = Flags.getByValSize();
  // Case 2: the value is a load; its base pointer must be a frame index.
4001   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4002     if (Flags.isByVal())
4003       // ByVal argument is passed in as a pointer but it's now being
4004       // dereferenced. e.g.
4005       // define @foo(%struct.X* %A) {
4006       //   tail call @bar(%struct.X* byval %A)
4009     SDValue Ptr = Ld->getBasePtr();
4010     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4013     FI = FINode->getIndex();
  // Case 3: a byval argument passed directly as a frame index.
4014   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4015     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4016     FI = FINode->getIndex();
4017     Bytes = Flags.getByValSize();
  // The frame index must be a fixed (incoming-argument) object at the same
  // offset the outgoing location expects.
4021   assert(FI != INT_MAX);
4022   if (!MFI.isFixedObjectIndex(FI))
4025   if (Offset != MFI.getObjectOffset(FI))
4028   if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4029     // If the argument location is wider than the argument type, check that any
4030     // extension flags match.
4031     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4032         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4037   return Bytes == MFI.getObjectSize(FI);
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
/// Rejects the transformation whenever the caller/callee ABIs, stack layout,
/// callee-saved registers, or callee-pop byte counts are incompatible.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  if (IsCalleeWin64 != IsCallerWin64)
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.
  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
  // Do not sibcall optimize vararg calls unless all arguments are passed via
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  unsigned StackArgsSize = 0;
  // If the callee takes no arguments then go on to check the results of the
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    // Allocate shadow area for Win64
    CCInfo.AllocateStack(32, 8);
    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getNextStackOffset();
    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        unsigned Reg = VA.getLocReg();
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);
  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
// Create the X86-specific FastISel instance for this function; simply
// delegates to the target's factory in X86FastISel.
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
4239 //===----------------------------------------------------------------------===//
4240 // Other Lowering Hooks
4241 //===----------------------------------------------------------------------===//
4243 static bool MayFoldLoad(SDValue Op) {
4244 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4247 static bool MayFoldIntoStore(SDValue Op) {
4248 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
/// Return true if \p Op has exactly one user and that user is a ZERO_EXTEND,
/// so the producing node may be folded into the extend.
static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (Op.hasOneUse()) {
    // Inspect the sole user's opcode.
    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
/// Return true if \p Opcode is one of the X86-specific DAG opcodes that
/// represents a target shuffle operation (a shuffle already matched to a
/// concrete x86 instruction form).
static bool isTargetShuffle(unsigned Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VZEXT_MOVL:
/// Return true if \p Opcode is a target shuffle whose mask comes from a
/// variable (register/memory) operand rather than an immediate.
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  default: return false;
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  // 'Faux' Target Shuffles.
/// Return a frame-index SDValue addressing the return-address slot, creating
/// the fixed frame object on first use and caching its index in
/// X86MachineFunctionInfo.
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  // A cached index of 0 means the frame object has not been created yet.
  int ReturnAddrIndex = FuncInfo->getRAIndex();
  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
    FuncInfo->setRAIndex(ReturnAddrIndex);
  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
/// Return true if \p Offset can be folded into an addressing mode under the
/// given code model; \p hasSymbolicDisplacement indicates the address also
/// carries a symbol reference, which tightens the acceptable range.
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
  // If we don't have a symbolic displacement - we don't have any extra
  if (!hasSymbolicDisplacement)
  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
  // For small code model we assume that latest object is 16MB before end of 31
  // bits boundary. We may also accept pretty large negative constants knowing
  // that all objects are in the positive half of address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
  // For kernel code model we know that all object resist in the negative half
  // of 32bits address space. We may not accept negative offsets, since they may
  // be just off and we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset >= 0)
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
  switch (CallingConv) {
  // These conventions use callee-pop semantics (on 32-bit x86).
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
  llvm_unreachable("Invalid integer condition!");
4406 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4407 switch (SetCCOpcode) {
4408 default: llvm_unreachable("Invalid integer condition!");
4409 case ISD::SETEQ: return X86::COND_E;
4410 case ISD::SETGT: return X86::COND_G;
4411 case ISD::SETGE: return X86::COND_GE;
4412 case ISD::SETLT: return X86::COND_L;
4413 case ISD::SETLE: return X86::COND_LE;
4414 case ISD::SETNE: return X86::COND_NE;
4415 case ISD::SETULT: return X86::COND_B;
4416 case ISD::SETUGT: return X86::COND_A;
4417 case ISD::SETULE: return X86::COND_BE;
4418 case ISD::SETUGE: return X86::COND_AE;
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make. May rewrite RHS (e.g. turning "X > -1" into a sign
/// test against 0) and may swap LHS/RHS for FP comparisons.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
                                    bool isFP, SDValue &LHS, SDValue &RHS,
                                    SelectionDAG &DAG) {
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
    if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
      // X > -1 -> X == 0, jump !sign.
      RHS = DAG.getConstant(0, DL, RHS.getValueType());
      return X86::COND_NS;
    if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
      // X < 0 -> X == 0, jump on sign.
    if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
      // X < 1 -> X <= 0.
      RHS = DAG.getConstant(0, DL, RHS.getValueType());
      return X86::COND_LE;
  // Integer comparisons translate directly.
  return TranslateIntegerX86CC(SetCCOpcode);
  // First determine if it is required or is profitable to flip the operands.
  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  switch (SetCCOpcode) {
    std::swap(LHS, RHS);
  // On a floating point condition, the flags are set as follows:
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETGT:  return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETGE:  return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETLT:  return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETLE:  return X86::COND_BE;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETUO:  return X86::COND_P;
  case ISD::SETO:   return X86::COND_NP;
  case ISD::SETUNE: return X86::COND_INVALID;
/// Is there a floating point cmov for the specific X86 condition code?
/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
/// Describe the memory access performed by a chained X86 intrinsic so the
/// SelectionDAG builder can create the right MachineMemOperand; returns true
/// when the intrinsic touches memory.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           unsigned Intrinsic) const {
  // Only intrinsics registered as having a chain can touch memory.
  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.readMem = false;
  Info.writeMem = false;
  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    // Reads a vector from memory; the loaded type is the call's result type.
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.readMem = true;
  case COMPRESS_TO_MEM: {
    // Writes the second operand's vector type to memory.
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.writeMem = true;
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    // Stores a vector truncated to i8/i16/i32 elements.
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;
    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.writeMem = true;
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  // Scan the table of FP immediates registered as natively materializable.
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
/// Target hook: decide whether it is profitable/legal to narrow \p Load.
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocation target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself. Only integer constants of at most 64 bits
/// qualify (they fit an immediate operand).
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
  assert(Ty->isIntegerTy());
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
/// Target hook: subvector extraction is considered cheap only when the
/// operation is legal/custom for \p ResVT and the extraction starts at
/// element 0 or exactly one result-width into the source.
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
  // Index 0 is the low subvector; Index == NumElts selects the adjacent
  // (upper) half of a double-width source.
  return (Index == 0 || Index == ResVT.getVectorNumElements());
4616 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4617 // Speculate cttz only if we can directly use TZCNT.
4618 return Subtarget.hasBMI();
4621 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4622 // Speculate ctlz only if we can directly use LZCNT.
4623 return Subtarget.hasLZCNT();
4626 bool X86TargetLowering::isCtlzFast() const {
4627 return Subtarget.hasFastLZCNT();
/// TargetLowering hook — see TargetLowering::isMaskAndCmp0FoldingBeneficial
/// for the contract (whether keeping an AND feeding a compare-with-zero is
/// beneficial for \p AndI).
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
/// Return true if an ANDN-based (and-not) compare should be preferred for
/// \p Y: requires BMI and a 32- or 64-bit integer type.
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
/// Return the type to use for a fast memcmp-style equality compare of
/// \p NumBits bits, or INVALID_SIMPLE_VALUE_TYPE if none is profitable.
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().
  return MVT::INVALID_SIMPLE_VALUE_TYPE;
4667 /// Val is the undef sentinel value or equal to the specified value.
4668 static bool isUndefOrEqual(int Val, int CmpVal) {
4669 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4672 /// Val is either the undef or zero sentinel value.
4673 static bool isUndefOrZero(int Val) {
4674 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
4686 /// Return true if Val is undef or if its value falls within the
4687 /// specified range (L, H].
4688 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4689 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
    if (!isUndefOrInRange(M, Low, Hi))
4702 /// Return true if Val is undef, zero or if its value falls within the
4703 /// specified range (L, H].
4704 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4705 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, matches the sequential values
/// Low, Low+1, ... or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  // Low is incremented in lockstep so each mask slot must hit its own value.
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, matches the sequential values
/// Low, Low+1, ..., or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  // Examine the mask two elements at a time; each pair becomes one wide lane.
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M1 = Mask[i + 1];
    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
    // Finally check if the two mask values are adjacent and aligned with
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
    // Otherwise we can't safely widen the elements used in this shuffle.
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can always
/// succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  // Start with an all-undef output of Scale times the input width.
  ScaledMask.assign(NumElts * Scale, -1);
  for (int i = 0; i != NumElts; ++i) {
    // Repeat sentinel values in every mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = M;
    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128 or 256 bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  // The index operand must be a compile-time constant.
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
  // The index should be aligned on a vecWidth-bit boundary.
  cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;
/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128 or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  // The index operand (operand 2 of INSERT_SUBVECTOR) must be constant.
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
  // The index should be aligned on a vecWidth-bit boundary.
  cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  bool Result = (Index * ElSize) % vecWidth == 0;
4870 bool X86::isVINSERT128Index(SDNode *N) {
4871 return isVINSERTIndex(N, 128);
4874 bool X86::isVINSERT256Index(SDNode *N) {
4875 return isVINSERTIndex(N, 256);
4878 bool X86::isVEXTRACT128Index(SDNode *N) {
4879 return isVEXTRACTIndex(N, 128);
4882 bool X86::isVEXTRACT256Index(SDNode *N) {
4883 return isVEXTRACTIndex(N, 256);
/// Compute the chunk number (immediate operand) for a VEXTRACT of width
/// \p vecWidth from the extract's constant element index.
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");
  cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
  MVT VecVT = N->getOperand(0).getSimpleValueType();
  MVT ElVT = VecVT.getVectorElementType();
  // Convert the element index into a chunk index.
  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
  return Index / NumElemsPerChunk;
/// Compute the chunk number (immediate operand) for a VINSERT of width
/// \p vecWidth from the insert's constant element index.
static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");
  cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
  MVT VecVT = N->getSimpleValueType(0);
  MVT ElVT = VecVT.getVectorElementType();
  // Convert the element index into a chunk index.
  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
  return Index / NumElemsPerChunk;
4916 /// Return the appropriate immediate to extract the specified
4917 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
4918 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4919 return getExtractVEXTRACTImmediate(N, 128);
4922 /// Return the appropriate immediate to extract the specified
4923 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
4924 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4925 return getExtractVEXTRACTImmediate(N, 256);
4928 /// Return the appropriate immediate to insert at the specified
4929 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4930 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4931 return getInsertVINSERTImmediate(N, 128);
4934 /// Return the appropriate immediate to insert at the specified
4935 /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
4936 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4937 return getInsertVINSERTImmediate(N, 256);
4940 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4941 bool X86::isZeroNode(SDValue Elt) {
4942 return isNullConstant(Elt) || isNullFPConstant(Elt);
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode (i64 is not legal there, so the
// vector is built as twice as many i32 elements and bitcast back).
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {
  SmallVector<SDValue, 32> Ops;
  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    // Negative entries are undef only when building a shuffle-mask vector.
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
                     DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    // High half of a split 64-bit element.
    Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                  DAG.getConstant(0, dl, EltVT));
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  ConstsNode = DAG.getBitcast(VT, ConstsNode);
// Build a constant vector from an array of APInt element values; bits set in
// \p Undefs mark elements emitted as UNDEF. i64 elements are split into two
// i32 elements when i64 is not legal, and the result is bitcast back to VT.
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
      // Undef elements need one or two UNDEF ops depending on splitting.
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
      // Split path: emit low then high 32 bits.
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");
  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    // AVX-512 mask vectors are built directly as i1 vectors.
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  return DAG.getBitcast(VT, Vec);
// Extracts the aligned 'vectorWidth'-bit chunk of Vec that contains element
// IdxVal, returning an EXTRACT_SUBVECTOR (or a smaller BUILD_VECTOR when the
// input is itself a BUILD_VECTOR). IdxVal is rounded down to a chunk boundary.
5045 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5046 const SDLoc &dl, unsigned vectorWidth) {
5047 EVT VT = Vec.getValueType();
5048 EVT ElVT = VT.getVectorElementType();
5049 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5050 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5051 VT.getVectorNumElements()/Factor);
5053 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5054 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5055 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5057 // This is the index of the first element of the vectorWidth-bit chunk
5058 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5059 IdxVal &= ~(ElemsPerChunk - 1);
5061 // If the input is a buildvector just emit a smaller one.
5062 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5063 return DAG.getBuildVector(
5064 ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk))
// Otherwise emit the generic subvector extract with the aligned index.
5066 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5067 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5070 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5071 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5072 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5073 /// instructions or a simple subregister reference. Idx is an index in the
5074 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5075 /// lowering EXTRACT_VECTOR_ELT operations easier.
// Thin wrapper: delegates to extractSubVector with a fixed 128-bit width.
5076 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5077 SelectionDAG &DAG, const SDLoc &dl) {
5078 assert((Vec.getValueType().is256BitVector() ||
5079 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5080 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5083 /// Generate a DAG to grab 256-bits from a 512-bit vector.
// Thin wrapper: delegates to extractSubVector with a fixed 256-bit width.
5084 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5085 SelectionDAG &DAG, const SDLoc &dl) {
5086 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5087 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
// Inserts Vec (a 128- or 256-bit subvector) into Result at the chunk-aligned
// position containing element IdxVal, via an INSERT_SUBVECTOR node.
// NOTE(review): the excerpt elides the body of the "Inserting UNDEF is
// Result" early-out (original lines 5096-5097) -- see the full source.
5090 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5091 SelectionDAG &DAG, const SDLoc &dl,
5092 unsigned vectorWidth) {
5093 assert((vectorWidth == 128 || vectorWidth == 256) &&
5094 "Unsupported vector width");
5095 // Inserting UNDEF is Result
5098 EVT VT = Vec.getValueType();
5099 EVT ElVT = VT.getVectorElementType();
5100 EVT ResultVT = Result.getValueType();
5102 // Insert the relevant vectorWidth bits.
5103 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5104 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5106 // This is the index of the first element of the vectorWidth-bit chunk
5107 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5108 IdxVal &= ~(ElemsPerChunk - 1);
5110 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5111 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5114 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5115 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5116 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5117 /// simple superregister reference. Idx is an index in the 128 bits
5118 /// we want. It need not be aligned to a 128-bit boundary. That makes
5119 /// lowering INSERT_VECTOR_ELT operations easier.
// Thin wrapper: delegates to insertSubVector with a fixed 128-bit width.
5120 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5121 SelectionDAG &DAG, const SDLoc &dl) {
5122 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5123 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
// Inserts a 256-bit subvector into a wider (512-bit) destination vector.
// Thin wrapper: delegates to insertSubVector with a fixed 256-bit width.
5126 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5127 SelectionDAG &DAG, const SDLoc &dl) {
5128 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5129 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5132 /// Insert i1-subvector to i1-vector.
// Lowers INSERT_SUBVECTOR for AVX-512 mask (i1) vectors. The operand vectors
// are widened to a natively supported KSHIFT width, combined with
// KSHIFTL/KSHIFTR + OR (or a vector shuffle for the middle-insert case), and
// the result is narrowed back to the original type.
// NOTE(review): multiple original lines are elided in this excerpt (the early
// 'return SDValue()'/'return Op' exits, the WideOpVT widening assignment, the
// second operands of several getNode calls, and closing braces). Consult the
// full source before changing control flow here.
5133 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5134 const X86Subtarget &Subtarget) {
5137 SDValue Vec = Op.getOperand(0);
5138 SDValue SubVec = Op.getOperand(1);
5139 SDValue Idx = Op.getOperand(2);
// Only constant insertion indices are handled.
5141 if (!isa<ConstantSDNode>(Idx))
5144 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5145 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5148 MVT OpVT = Op.getSimpleValueType();
5149 MVT SubVecVT = SubVec.getSimpleValueType();
5150 unsigned NumElems = OpVT.getVectorNumElements();
5151 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5153 assert(IdxVal + SubVecNumElems <= NumElems &&
5154 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5155 "Unexpected index value in INSERT_SUBVECTOR");
5157 // There are 3 possible cases:
5158 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5159 // 2. Subvector should be inserted in the upper part
5160 // (IdxVal + SubVecNumElems == NumElems)
5161 // 3. Subvector should be inserted in the middle (for example v2i1
5162 // to v16i1, index 2)
5164 // extend to natively supported kshift
5165 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5166 MVT WideOpVT = OpVT;
5167 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5170 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5171 SDValue Undef = DAG.getUNDEF(WideOpVT);
// Place SubVec into the low elements of a wide undef vector.
5172 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5173 Undef, SubVec, ZeroIdx);
5175 // Extract sub-vector if require.
// Narrows V back to OpVT if it was widened above (no-op otherwise).
5176 auto ExtractSubVec = [&](SDValue V) {
5177 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
// Case: destination is undef -- just shift the subvector into position.
5181 if (Vec.isUndef()) {
5183 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5184 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5187 return ExtractSubVec(WideSubVec);
// Case: destination is all-zero -- shift left then right to zero-extend the
// subvector into the right slot.
5190 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5191 NumElems = WideOpVT.getVectorNumElements();
5192 unsigned ShiftLeft = NumElems - SubVecNumElems;
5193 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5194 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5195 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5196 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5197 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5198 return ExtractSubVec(Vec);
// Case: insert at the low end -- clear the low SubVecNumElems bits of Vec
// with a right-then-left shift pair, then OR in the zero-extended subvector.
5202 // Zero lower bits of the Vec
5203 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5204 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5205 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5206 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5207 // Merge them together, SubVec should be zero extended.
5208 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5209 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5211 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5212 return ExtractSubVec(Vec);
5215 // Simple case when we put subvector in the upper part
5216 if (IdxVal + SubVecNumElems == NumElems) {
5217 // Zero upper bits of the Vec
5218 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5219 DAG.getConstant(IdxVal, dl, MVT::i8));
5220 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5221 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5222 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5223 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5224 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5225 return ExtractSubVec(Vec);
5227 // Subvector should be inserted in the middle - use shuffle
5228 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5230 SmallVector<int, 64> Mask;
// Elements inside [IdxVal, IdxVal+SubVecNumElems) come from the subvector,
// everything else from the original vector (second shuffle input).
5231 for (unsigned i = 0; i < NumElems; ++i)
5232 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5234 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5237 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5238 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5239 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5240 /// large BUILD_VECTORS.
// V1 goes to the low half, V2 to the high half (element NumElems/2 onward).
5241 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5242 unsigned NumElems, SelectionDAG &DAG,
5244 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5245 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
// Concat two 256-bit vectors into a 512-bit vector; mirrors
// concat128BitVectors but at 256-bit granularity.
5248 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5249 unsigned NumElems, SelectionDAG &DAG,
5251 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5252 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5255 /// Returns a vector of specified type with all bits set.
5256 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5257 /// Then bitcast to their original type, ensuring they get CSE'd.
5258 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5259 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5260 "Expected a 128/256/512-bit vector type");
// Build as <N x i32> of ~0 and bitcast so all all-ones constants share a node.
5262 APInt Ones = APInt::getAllOnesValue(32);
5263 unsigned NumElts = VT.getSizeInBits() / 32;
5264 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5265 return DAG.getBitcast(VT, Vec);
// Emits a VSEXT/VZEXT (or the in-register *_EXTEND_VECTOR_INREG form for
// 128-bit cases) from In to VT, first shrinking the input to the minimal
// half/quarter subvector actually consumed by the extension.
5268 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5269 SelectionDAG &DAG) {
5270 EVT InVT = In.getValueType();
5271 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
// 128->128-bit: use the generic in-register extend helpers.
5273 if (VT.is128BitVector() && InVT.is128BitVector())
5274 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5275 : DAG.getZeroExtendVectorInReg(In, DL, VT)
5277 // For 256-bit vectors, we only need the lower (128-bit) input half.
5278 // For 512-bit vectors, we only need the lower input half or quarter.
5279 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5280 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5281 In = extractSubVector(In, 0, DAG, DL,
5282 std::max(128, (int)VT.getSizeInBits() / Scale));
5285 return DAG.getNode(Opc, DL, VT, In);
5288 /// Generate unpacklo/unpackhi shuffle mask.
// Lo selects the low-half (unpacklo) vs high-half (unpackhi) interleave;
// Unary interleaves a single input with itself instead of two inputs.
// The mask is built per 128-bit lane, matching x86 PUNPCK* semantics.
5289 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5291 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5292 int NumElts = VT.getVectorNumElements();
5293 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5295 for (int i = 0; i < NumElts; ++i) {
5296 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
// Alternate elements between the two sources within each lane.
5297 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5298 Pos += (Unary ? 0 : NumElts * (i % 2));
5299 Pos += (Lo ? 0 : NumEltsInLane / 2);
5300 Mask.push_back(Pos);
5304 /// Returns a vector_shuffle node for an unpackl operation.
// Builds the binary (two-input) unpacklo mask and emits the shuffle.
5305 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5306 SDValue V1, SDValue V2) {
5307 SmallVector<int, 8> Mask;
5308 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5309 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5312 /// Returns a vector_shuffle node for an unpackh operation.
// Builds the binary (two-input) unpackhi mask and emits the shuffle.
5313 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5314 SDValue V1, SDValue V2) {
5315 SmallVector<int, 8> Mask;
5316 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5317 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5320 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5321 /// This produces a shuffle where the low element of V2 is swizzled into the
5322 /// zero/undef vector, landing at element Idx.
5323 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
// NOTE(review): the excerpt elides the start of the 'SDValue V1 = ...'
// initializer (original line 5329) that selects zero vs undef as the base.
5324 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5326 const X86Subtarget &Subtarget,
5327 SelectionDAG &DAG) {
5328 MVT VT = V2.getSimpleValueType();
5330 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT)
5331 int NumElems = VT.getVectorNumElements();
5332 SmallVector<int, 16> MaskVec(NumElems);
5333 for (int i = 0; i != NumElems; ++i)
5334 // If this is the insertion idx, put the low elt of V2 here.
5335 MaskVec[i] = (i == Idx) ? NumElems : i;
5336 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
// Walks through any chain of BITCAST nodes and returns the underlying value.
// NOTE(review): the trailing 'return V;' (original line 5342) is elided here.
5339 static SDValue peekThroughBitcasts(SDValue V) {
5340 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5341 V = V.getOperand(0);
// Like peekThroughBitcasts, but only steps through a bitcast when its source
// has a single use (so rewriting it cannot affect other users).
// NOTE(review): the trailing 'return V;' (original line 5349) is elided here.
5345 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5346 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5347 V.getOperand(0).hasOneUse())
5348 V = V.getOperand(0);
// Returns the IR Constant behind a load from the constant pool (looking
// through bitcasts and the X86 Wrapper/WrapperRIP address nodes), or null if
// Op is not such a load / is a machine constant-pool entry.
// NOTE(review): the 'if (!Load) return nullptr;' style early-outs between the
// visible lines are elided in this excerpt.
5352 static const Constant *getTargetConstantFromNode(SDValue Op) {
5353 Op = peekThroughBitcasts(Op);
5355 auto *Load = dyn_cast<LoadSDNode>(Op);
5359 SDValue Ptr = Load->getBasePtr();
5360 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5361 Ptr->getOpcode() == X86ISD::WrapperRIP)
5362 Ptr = Ptr->getOperand(0);
5364 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5365 if (!CNode || CNode->isMachineConstantPoolEntry())
5368 return dyn_cast<Constant>(CNode->getConstVal());
5371 // Extract raw constant bits from constant pools.
// Decomposes Op into NumElts = SizeInBits/EltSizeInBits elements of raw
// constant bits (EltBits) plus a per-element undef mask (UndefElts).
// Handles: constant BUILD_VECTOR, constant-pool loads, VBROADCAST of a
// constant-pool scalar, and VZEXT_MOVL(SCALAR_TO_VECTOR(constant)).
// Returns false when the node is not a recognizable constant or when undef
// bits are present but disallowed by the Allow*Undefs flags.
// NOTE(review): several early-out 'return false;' lines and closing braces
// are elided from this excerpt.
5372 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5374 SmallVectorImpl<APInt> &EltBits,
5375 bool AllowWholeUndefs = true,
5376 bool AllowPartialUndefs = true) {
5377 assert(EltBits.empty() && "Expected an empty EltBits vector");
5379 Op = peekThroughBitcasts(Op);
5381 EVT VT = Op.getValueType();
5382 unsigned SizeInBits = VT.getSizeInBits();
5383 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5384 unsigned NumElts = SizeInBits / EltSizeInBits;
5386 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5387 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5389 // Extract all the undef/constant element data and pack into single bitsets.
5390 APInt UndefBits(SizeInBits, 0);
5391 APInt MaskBits(SizeInBits, 0);
5393 // Split the undef/constant single bitset data into the target elements.
5394 auto SplitBitData = [&]() {
5395 // Don't split if we don't allow undef bits.
5396 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5397 if (UndefBits.getBoolValue() && !AllowUndefs)
5400 UndefElts = APInt(NumElts, 0);
5401 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5403 for (unsigned i = 0; i != NumElts; ++i) {
5404 unsigned BitOffset = i * EltSizeInBits;
5405 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5407 // Only treat an element as UNDEF if all bits are UNDEF.
5408 if (UndefEltBits.isAllOnesValue()) {
5409 if (!AllowWholeUndefs)
5411 UndefElts.setBit(i);
5415 // If only some bits are UNDEF then treat them as zero (or bail if not
5417 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5420 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5421 EltBits[i] = Bits.getZExtValue();
5426 // Collect constant bits and insert into mask/undef bit masks.
// Returns false for constant kinds it cannot decompose (the elided fallthrough).
5427 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5428 unsigned BitOffset) {
5431 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5432 if (isa<UndefValue>(Cst)) {
5433 Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
5436 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5437 Mask.insertBits(CInt->getValue(), BitOffset);
5440 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5441 Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
5447 // Extract constant bits from build vector.
5448 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5449 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5450 const SDValue &Src = Op.getOperand(i);
5451 unsigned BitOffset = i * SrcEltSizeInBits;
5452 if (Src.isUndef()) {
5453 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5456 auto *Cst = cast<ConstantSDNode>(Src);
5457 APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5458 MaskBits.insertBits(Bits, BitOffset);
5460 return SplitBitData();
5463 // Extract constant bits from constant pool vector.
5464 if (auto *Cst = getTargetConstantFromNode(Op)) {
5465 Type *CstTy = Cst->getType();
5466 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5469 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5470 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
5471 if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
5472 i * CstEltSizeInBits))
5475 return SplitBitData();
5478 // Extract constant bits from a broadcasted constant pool scalar.
5479 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5480 EltSizeInBits <= SrcEltSizeInBits) {
5481 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5482 APInt Bits(SizeInBits, 0);
5483 APInt Undefs(SizeInBits, 0);
5484 if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
// Replicate the scalar's bits into every source-element slot.
5485 for (unsigned i = 0; i != NumSrcElts; ++i) {
5486 MaskBits |= Bits.shl(i * SrcEltSizeInBits);
5487 UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
5489 return SplitBitData();
5494 // Extract a rematerialized scalar constant insertion.
5495 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5496 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5497 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5498 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5499 MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5500 MaskBits = MaskBits.zext(SizeInBits);
5501 return SplitBitData();
// Extracts a constant shuffle-control vector as raw unsigned indices.
// Fails (returns false) if any element is wholly or partially undef, since
// callers need every mask entry to be a concrete value.
// NOTE(review): the 'APInt UndefElts;' declaration (original line 5510) and
// the final 'return true;' are elided from this excerpt.
5507 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5508 unsigned MaskEltSizeInBits,
5509 SmallVectorImpl<uint64_t> &RawMask) {
5511 SmallVector<APInt, 64> EltBits;
5513 // Extract the raw target constant bits.
5514 // FIXME: We currently don't support UNDEF bits or mask entries.
5515 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5516 EltBits, /* AllowWholeUndefs */ false,
5517 /* AllowPartialUndefs */ false))
5520 // Insert the extracted elements into the mask.
5521 for (APInt Elt : EltBits)
5522 RawMask.push_back(Elt.getZExtValue());
5527 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5528 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5529 /// operands in \p Ops, and returns true.
5530 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5531 /// IsUnary for shuffles which use a single input multiple times, and in those
5532 /// cases it will adjust the mask to only have indices within that single input.
5533 /// It is an error to call this with non-empty Mask/Ops vectors.
// NOTE(review): this excerpt elides many lines of the original switch -- in
// particular the 'break;' terminating each case, some sibling case labels
// (e.g. SHUFP before 5552, VSHLI alongside VSRLI elsewhere), the
// 'SDValue ImmN;' declaration, and several 'IsUnary = true;' assignments.
// Do not infer fallthrough behavior from the visible lines alone.
5534 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5535 SmallVectorImpl<SDValue> &Ops,
5536 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5537 unsigned NumElems = VT.getVectorNumElements();
5540 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5541 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
// IsFakeUnary: both operands are the same node, so the decoded two-input
// mask can be folded to reference only input 0 (see remap loop at the end).
5544 bool IsFakeUnary = false;
5545 switch(N->getOpcode()) {
5546 case X86ISD::BLENDI:
5547 ImmN = N->getOperand(N->getNumOperands()-1);
5548 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5549 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5552 ImmN = N->getOperand(N->getNumOperands()-1);
5553 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5554 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5556 case X86ISD::INSERTPS:
5557 ImmN = N->getOperand(N->getNumOperands()-1);
5558 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5559 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5561 case X86ISD::UNPCKH:
5562 DecodeUNPCKHMask(VT, Mask);
5563 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5565 case X86ISD::UNPCKL:
5566 DecodeUNPCKLMask(VT, Mask);
5567 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5569 case X86ISD::MOVHLPS:
5570 DecodeMOVHLPSMask(NumElems, Mask);
5571 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5573 case X86ISD::MOVLHPS:
5574 DecodeMOVLHPSMask(NumElems, Mask);
5575 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5577 case X86ISD::PALIGNR:
5578 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5579 ImmN = N->getOperand(N->getNumOperands()-1);
5580 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5581 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// PALIGNR's decode references the operands in swapped order.
5582 Ops.push_back(N->getOperand(1));
5583 Ops.push_back(N->getOperand(0));
5585 case X86ISD::VSHLDQ:
5586 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5587 ImmN = N->getOperand(N->getNumOperands() - 1);
5588 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5591 case X86ISD::VSRLDQ:
5592 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5593 ImmN = N->getOperand(N->getNumOperands() - 1);
5594 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5597 case X86ISD::PSHUFD:
5598 case X86ISD::VPERMILPI:
5599 ImmN = N->getOperand(N->getNumOperands()-1);
5600 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5603 case X86ISD::PSHUFHW:
5604 ImmN = N->getOperand(N->getNumOperands()-1);
5605 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5608 case X86ISD::PSHUFLW:
5609 ImmN = N->getOperand(N->getNumOperands()-1);
5610 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5613 case X86ISD::VZEXT_MOVL:
5614 DecodeZeroMoveLowMask(VT, Mask);
5617 case X86ISD::VBROADCAST: {
5618 SDValue N0 = N->getOperand(0);
5619 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5620 // add the pre-extracted value to the Ops vector.
5621 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5622 N0.getOperand(0).getValueType() == VT &&
5623 N0.getConstantOperandVal(1) == 0)
5624 Ops.push_back(N0.getOperand(0));
5626 // We only decode broadcasts of same-sized vectors, unless the broadcast
5627 // came from an extract from the original width. If we found one, we
5628 // pushed it the Ops vector above.
5629 if (N0.getValueType() == VT || !Ops.empty()) {
5630 DecodeVectorBroadcast(VT, Mask);
5636 case X86ISD::VPERMILPV: {
5638 SDValue MaskNode = N->getOperand(1);
5639 unsigned MaskEltSize = VT.getScalarSizeInBits();
5640 SmallVector<uint64_t, 32> RawMask;
// Prefer decoding from raw constant indices; fall back to the IR Constant.
5641 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5642 DecodeVPERMILPMask(VT, RawMask, Mask);
5645 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5646 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5651 case X86ISD::PSHUFB: {
5653 SDValue MaskNode = N->getOperand(1);
5654 SmallVector<uint64_t, 32> RawMask;
5655 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5656 DecodePSHUFBMask(RawMask, Mask);
5659 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5660 DecodePSHUFBMask(C, Mask);
5665 case X86ISD::VPERMI:
5666 ImmN = N->getOperand(N->getNumOperands()-1);
5667 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5672 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5674 case X86ISD::VPERM2X128:
5675 ImmN = N->getOperand(N->getNumOperands()-1);
5676 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5677 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5679 case X86ISD::MOVSLDUP:
5680 DecodeMOVSLDUPMask(VT, Mask);
5683 case X86ISD::MOVSHDUP:
5684 DecodeMOVSHDUPMask(VT, Mask);
5687 case X86ISD::MOVDDUP:
5688 DecodeMOVDDUPMask(VT, Mask);
5691 case X86ISD::MOVLHPD:
5692 case X86ISD::MOVLPD:
5693 case X86ISD::MOVLPS:
5694 // Not yet implemented
5696 case X86ISD::VPERMIL2: {
5697 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5698 unsigned MaskEltSize = VT.getScalarSizeInBits();
5699 SDValue MaskNode = N->getOperand(2);
5700 SDValue CtrlNode = N->getOperand(3);
5701 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5702 unsigned CtrlImm = CtrlOp->getZExtValue();
5703 SmallVector<uint64_t, 32> RawMask;
5704 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5705 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5708 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5709 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5715 case X86ISD::VPPERM: {
5716 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5717 SDValue MaskNode = N->getOperand(2);
5718 SmallVector<uint64_t, 32> RawMask;
5719 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5720 DecodeVPPERMMask(RawMask, Mask);
5723 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5724 DecodeVPPERMMask(C, Mask);
5729 case X86ISD::VPERMV: {
5731 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5732 Ops.push_back(N->getOperand(1));
5733 SDValue MaskNode = N->getOperand(0);
5734 SmallVector<uint64_t, 32> RawMask;
5735 unsigned MaskEltSize = VT.getScalarSizeInBits();
5736 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5737 DecodeVPERMVMask(RawMask, Mask);
5740 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5741 DecodeVPERMVMask(C, MaskEltSize, Mask);
5746 case X86ISD::VPERMV3: {
5747 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5748 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5749 Ops.push_back(N->getOperand(0));
5750 Ops.push_back(N->getOperand(2));
5751 SDValue MaskNode = N->getOperand(1);
5752 unsigned MaskEltSize = VT.getScalarSizeInBits();
5753 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5754 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5759 case X86ISD::VPERMIV3: {
5760 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5761 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5762 Ops.push_back(N->getOperand(1));
5763 Ops.push_back(N->getOperand(2));
5764 SDValue MaskNode = N->getOperand(0);
5765 unsigned MaskEltSize = VT.getScalarSizeInBits();
5766 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5767 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5772 default: llvm_unreachable("unknown target shuffle node");
5775 // Empty mask indicates the decode failed.
5779 // Check if we're getting a shuffle mask with zero'd elements.
5780 if (!AllowSentinelZero)
5781 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5784 // If we have a fake unary shuffle, the shuffle mask is spread across two
5785 // inputs that are actually the same node. Re-map the mask to always point
5786 // into the first input.
// (Loop header over Mask elements is elided; M is the current mask entry.)
5789 if (M >= (int)Mask.size())
5792 // If we didn't already add operands in the opcode-specific code, default to
5793 // adding 1 or 2 operands starting at 0.
5795 Ops.push_back(N->getOperand(0));
5796 if (!IsUnary || IsFakeUnary)
5797 Ops.push_back(N->getOperand(1));
5803 /// Check a target shuffle mask's inputs to see if we can set any values to
5804 /// SM_SentinelZero - this is for elements that are known to be zero
5805 /// (not just zeroable) from their inputs.
5806 /// Returns true if the target shuffle mask was decoded.
// NOTE(review): the 'bool IsUnary;' declaration, early 'return false;' lines,
// 'int M = Mask[i];' and the final 'return true;' are elided in this excerpt.
5807 static bool setTargetShuffleZeroElements(SDValue N,
5808 SmallVectorImpl<int> &Mask,
5809 SmallVectorImpl<SDValue> &Ops) {
5811 if (!isTargetShuffle(N.getOpcode()))
5814 MVT VT = N.getSimpleValueType();
5815 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5818 SDValue V1 = Ops[0];
5819 SDValue V2 = IsUnary ? V1 : Ops[1];
// Look through bitcasts so constant/undef sources are visible.
5821 V1 = peekThroughBitcasts(V1);
5822 V2 = peekThroughBitcasts(V2);
5824 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5825 "Illegal split of shuffle value type");
5826 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5828 // Extract known constant input data.
5829 APInt UndefSrcElts[2];
5830 SmallVector<APInt, 32> SrcEltBits[2];
5831 bool IsSrcConstant[2] = {
5832 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5833 SrcEltBits[0], true, false),
5834 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5835 SrcEltBits[1], true, false)};
5837 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5840 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5844 // Determine shuffle input and normalize the mask.
5845 unsigned SrcIdx = M / Size;
5846 SDValue V = M < Size ? V1 : V2;
// (The 'M %= Size;' normalization between these lines is elided.)
5849 // We are referencing an UNDEF input.
5851 Mask[i] = SM_SentinelUndef;
5855 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5856 // TODO: We currently only set UNDEF for integer types - floats use the same
5857 // registers as vectors and many of the scalar folded loads rely on the
5858 // SCALAR_TO_VECTOR pattern.
5859 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5860 (Size % V.getValueType().getVectorNumElements()) == 0) {
5861 int Scale = Size / V.getValueType().getVectorNumElements();
5862 int Idx = M / Scale;
5863 if (Idx != 0 && !VT.isFloatingPoint())
5864 Mask[i] = SM_SentinelUndef;
5865 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5866 Mask[i] = SM_SentinelZero;
5870 // Attempt to extract from the source's constant bits.
5871 if (IsSrcConstant[SrcIdx]) {
5872 if (UndefSrcElts[SrcIdx][M])
5873 Mask[i] = SM_SentinelUndef;
5874 else if (SrcEltBits[SrcIdx][M] == 0)
5875 Mask[i] = SM_SentinelZero;
5879 assert(VT.getVectorNumElements() == Mask.size() &&
5880 "Different mask size from vector size!");
5884 // Attempt to decode ops that could be represented as a shuffle mask.
5885 // The decoded shuffle mask may contain a different number of elements to the
5886 // destination value type.
// Handles non-shuffle opcodes that still behave like shuffles: AND/ANDNP with
// a 0/255 per-byte constant, SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT), PINSRB/W
// patterns, whole-byte VSHLI/VSRLI shifts, and in-register zero-extensions.
// NOTE(review): the excerpt elides the 'switch (Opcode)' header, several case
// labels that share bodies (e.g. ISD::AND with ANDNP, VSHLI with VSRLI), the
// per-case 'return true;' lines, and the default 'return false;'.
5887 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5888 SmallVectorImpl<SDValue> &Ops) {
5892 MVT VT = N.getSimpleValueType();
5893 unsigned NumElts = VT.getVectorNumElements();
5894 unsigned NumSizeInBits = VT.getSizeInBits();
5895 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5896 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5897 "Expected byte aligned value types");
5899 unsigned Opcode = N.getOpcode();
5902 case X86ISD::ANDNP: {
5903 // Attempt to decode as a per-byte mask.
5905 SmallVector<APInt, 32> EltBits;
5906 SDValue N0 = N.getOperand(0);
5907 SDValue N1 = N.getOperand(1);
5908 bool IsAndN = (X86ISD::ANDNP == Opcode);
// For ANDNP the (inverted) constant selects zeros where bytes are 255.
5909 uint64_t ZeroMask = IsAndN ? 255 : 0;
5910 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5912 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5914 Mask.push_back(SM_SentinelUndef);
5917 uint64_t ByteBits = EltBits[i].getZExtValue();
// Only all-zero or all-one bytes are expressible as shuffle elements.
5918 if (ByteBits != 0 && ByteBits != 255)
5920 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5922 Ops.push_back(IsAndN ? N1 : N0);
5925 case ISD::SCALAR_TO_VECTOR: {
5926 // Match against a scalar_to_vector of an extract from a similar vector.
5927 SDValue N0 = N.getOperand(0);
5928 if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5929 N0.getOperand(0).getValueType() != VT ||
5930 !isa<ConstantSDNode>(N0.getOperand(1)) ||
5931 NumElts <= N0.getConstantOperandVal(1) ||
5932 !N->isOnlyUserOf(N0.getNode()))
5934 Ops.push_back(N0.getOperand(0));
5935 Mask.push_back(N0.getConstantOperandVal(1));
5936 Mask.append(NumElts - 1, SM_SentinelUndef);
5939 case X86ISD::PINSRB:
5940 case X86ISD::PINSRW: {
5941 SDValue InVec = N.getOperand(0);
5942 SDValue InScl = N.getOperand(1);
5943 uint64_t InIdx = N.getConstantOperandVal(2);
5944 assert(InIdx < NumElts && "Illegal insertion index");
5946 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5947 if (X86::isZeroNode(InScl)) {
5948 Ops.push_back(InVec);
5949 for (unsigned i = 0; i != NumElts; ++i)
5950 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5954 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
5955 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
5957 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5958 if (InScl.getOpcode() != ISD::AssertZext ||
5959 InScl.getOperand(0).getOpcode() != ExOp)
5962 SDValue ExVec = InScl.getOperand(0).getOperand(0);
5963 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
5964 assert(ExIdx < NumElts && "Illegal extraction index");
5965 Ops.push_back(InVec);
5966 Ops.push_back(ExVec);
// Element InIdx is taken from ExVec (second input); the rest from InVec.
5967 for (unsigned i = 0; i != NumElts; ++i)
5968 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
5972 case X86ISD::VSRLI: {
5973 uint64_t ShiftVal = N.getConstantOperandVal(1);
5974 // Out of range bit shifts are guaranteed to be zero.
5975 if (NumBitsPerElt <= ShiftVal) {
5976 Mask.append(NumElts, SM_SentinelZero);
5980 // We can only decode 'whole byte' bit shifts as shuffles.
5981 if ((ShiftVal % 8) != 0)
5984 uint64_t ByteShift = ShiftVal / 8;
5985 unsigned NumBytes = NumSizeInBits / 8;
5986 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5987 Ops.push_back(N.getOperand(0));
5989 // Clear mask to all zeros and insert the shifted byte indices.
5990 Mask.append(NumBytes, SM_SentinelZero);
5992 if (X86ISD::VSHLI == Opcode) {
5993 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5994 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5995 Mask[i + j] = i + j - ByteShift;
5997 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5998 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5999 Mask[i + j - ByteShift] = i + j;
6003 case ISD::ZERO_EXTEND_VECTOR_INREG:
6004 case X86ISD::VZEXT: {
6005 // TODO - add support for VPMOVZX with smaller input vector types.
6006 SDValue Src = N.getOperand(0);
6007 MVT SrcVT = Src.getSimpleValueType();
6008 if (NumSizeInBits != SrcVT.getSizeInBits())
6010 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6019 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
// Each input vector owns a contiguous run of MaskWidth indices inside Mask;
// an input that no mask element points into is dropped from Inputs.
6020 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6021 SmallVectorImpl<int> &Mask) {
6022 int MaskWidth = Mask.size();
6023 SmallVector<SDValue, 16> UsedInputs;
6024 for (int i = 0, e = Inputs.size(); i < e; ++i) {
// [lo, hi) is the index range input i will occupy once all previously
// dropped inputs are accounted for (hence UsedInputs.size(), not i).
6025 int lo = UsedInputs.size() * MaskWidth;
6026 int hi = lo + MaskWidth;
// Keep this input only if at least one mask element references it.
6027 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6028 UsedInputs.push_back(Inputs[i]);
// NOTE(review): the else-path that renumbers Mask when an input is dropped
// is elided from this excerpt.
6035 Inputs = UsedInputs;
6038 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6039 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6040 /// remaining input indices in case we now have a unary shuffle and adjust the
6041 /// inputs accordingly.
6042 /// Returns true if the target shuffle mask was decoded.
6043 static bool resolveTargetShuffleInputs(SDValue Op,
6044 SmallVectorImpl<SDValue> &Inputs,
6045 SmallVectorImpl<int> &Mask) {
// Primary decode first; if that fails, fall back to matching "faux"
// shuffles (nodes that are not target shuffles but act like one).
6046 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6047 if (!getFauxShuffleMask(Op, Mask, Inputs))
// Drop any inputs the resolved mask no longer references.
6050 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6054 /// Returns the scalar element that will make up the ith
6055 /// element of the result of the vector shuffle.
// Walks through (possibly nested) shuffles/bitcasts to locate the scalar that
// feeds result element 'Index', or SDValue() if it cannot be determined.
6056 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
// Depth-limited recursion guard — give up rather than walk arbitrarily deep.
6059 return SDValue(); // Limit search depth.
6061 SDValue V = SDValue(N, 0);
6062 EVT VT = V.getValueType();
6063 unsigned Opcode = V.getOpcode();
6065 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6066 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6067 int Elt = SV->getMaskElt(Index);
6070 return DAG.getUNDEF(VT.getVectorElementType());
6072 unsigned NumElems = VT.getVectorNumElements();
// Mask indices >= NumElems select from the second shuffle operand.
6073 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6074 : SV->getOperand(1);
6075 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6078 // Recurse into target specific vector shuffles to find scalars.
6079 if (isTargetShuffle(Opcode)) {
6080 MVT ShufVT = V.getSimpleValueType();
6081 MVT ShufSVT = ShufVT.getVectorElementType();
6082 int NumElems = (int)ShufVT.getVectorNumElements();
6083 SmallVector<int, 16> ShuffleMask;
6084 SmallVector<SDValue, 16> ShuffleOps;
6087 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6090 int Elt = ShuffleMask[Index];
// Zero sentinel: the element is a known zero of the scalar type.
6091 if (Elt == SM_SentinelZero)
6092 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6093 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6094 if (Elt == SM_SentinelUndef)
6095 return DAG.getUNDEF(ShufSVT);
6097 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6098 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6099 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6103 // Actual nodes that may contain scalar elements
6104 if (Opcode == ISD::BITCAST) {
6105 V = V.getOperand(0);
6106 EVT SrcVT = V.getValueType();
6107 unsigned NumElems = VT.getVectorNumElements();
// Only look through bitcasts that preserve the element count.
6109 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6113 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
// SCALAR_TO_VECTOR defines only element 0; all others are undef.
6114 return (Index == 0) ? V.getOperand(0)
6115 : DAG.getUNDEF(VT.getVectorElementType());
6117 if (V.getOpcode() == ISD::BUILD_VECTOR)
6118 return V.getOperand(Index);
6123 /// Custom lower build_vector of v16i8.
// NonZeros is a 16-bit mask of the non-zero elements; NumNonZero/NumZero are
// the corresponding counts. Two strategies: SSE4.1 byte inserts (PINSRB), or
// a pre-SSE4.1 fallback that packs byte pairs into i16 and uses PINSRW.
6124 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6125 unsigned NumNonZero, unsigned NumZero,
6127 const X86Subtarget &Subtarget) {
6135 // SSE4.1 - use PINSRB to insert each byte directly.
6136 if (Subtarget.hasSSE41()) {
6137 for (unsigned i = 0; i < 16; ++i) {
6138 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6140 // If the build vector contains zeros or our first insertion is not the
6141 // first index then insert into zero vector to break any register
6142 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6145 if (NumZero || 0 != i)
6146 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6148 assert(0 == i && "Expected insertion into zero-index");
// First element, no zeros needed: widen to i32, splat into lane 0 and let
// VZEXT_MOVL zero the upper lanes, then reinterpret as v16i8.
6149 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6150 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6151 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6152 V = DAG.getBitcast(MVT::v16i8, V);
6156 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6157 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6164 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6165 for (unsigned i = 0; i < 16; ++i) {
6166 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6167 if (ThisIsNonZero && First) {
6169 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6171 V = DAG.getUNDEF(MVT::v8i16);
6176 // FIXME: Investigate extending to i32 instead of just i16.
6177 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6178 SDValue ThisElt, LastElt;
6179 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6180 if (LastIsNonZero) {
6182 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6184 if (ThisIsNonZero) {
// Combine the odd byte into the high half of the i16 pair (little endian:
// byte i goes to bits [15:8], byte i-1 to bits [7:0]).
6185 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6186 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6187 DAG.getConstant(8, dl, MVT::i8));
6189 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
// When zeros are present the upper bits must really be zero; otherwise an
// any-extend is enough.
6195 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6196 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6197 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6198 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6199 V = DAG.getBitcast(MVT::v8i16, V);
6201 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6202 DAG.getIntPtrConstant(i / 2, dl));
6208 return DAG.getBitcast(MVT::v16i8, V);
6211 /// Custom lower build_vector of v8i16.
// Mirrors the SSE4.1 path of LowerBuildVectorv16i8, but inserts i16 elements
// (PINSRW is available from SSE2, so no subtarget feature check is needed).
6212 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6213 unsigned NumNonZero, unsigned NumZero,
6215 const X86Subtarget &Subtarget) {
6222 for (unsigned i = 0; i < 8; ++i) {
6223 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6225 // If the build vector contains zeros or our first insertion is not the
6226 // first index then insert into zero vector to break any register
6227 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6230 if (NumZero || 0 != i)
6231 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6233 assert(0 == i && "Expected insertion into zero-index");
// First element, no zeros needed: widen to i32, place in lane 0, and let
// VZEXT_MOVL zero the remaining lanes.
6234 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6235 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6236 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6237 V = DAG.getBitcast(MVT::v8i16, V);
6241 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6242 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6249 /// Custom lower build_vector of v4i32 or v4f32.
// Two lowering attempts, in order: (1) a shuffle-with-zero (blend) when every
// non-zero element is extracted in-place from the same source vector, and
// (2) an SSE4.1 INSERTPS when exactly one element comes from elsewhere.
6250 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6251 const X86Subtarget &Subtarget) {
6252 // Find all zeroable elements.
6253 std::bitset<4> Zeroable;
6254 for (int i=0; i < 4; ++i) {
6255 SDValue Elt = Op->getOperand(i);
6256 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6258 assert(Zeroable.size() - Zeroable.count() > 1 &&
6259 "We expect at least two non-zero elements!");
6261 // We only know how to deal with build_vector nodes where elements are either
6262 // zeroable or extract_vector_elt with constant index.
6263 SDValue FirstNonZero;
6264 unsigned FirstNonZeroIdx;
6265 for (unsigned i=0; i < 4; ++i) {
6268 SDValue Elt = Op->getOperand(i);
6269 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6270 !isa<ConstantSDNode>(Elt.getOperand(1)))
6272 // Make sure that this node is extracting from a 128-bit vector.
6273 MVT VT = Elt.getOperand(0).getSimpleValueType();
6274 if (!VT.is128BitVector())
6276 if (!FirstNonZero.getNode()) {
6278 FirstNonZeroIdx = i;
6282 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6283 SDValue V1 = FirstNonZero.getOperand(0);
6284 MVT VT = V1.getSimpleValueType();
6286 // See if this build_vector can be lowered as a blend with zero.
6288 unsigned EltMaskIdx, EltIdx;
6290 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6291 if (Zeroable[EltIdx]) {
6292 // The zero vector will be on the right hand side.
6293 Mask[EltIdx] = EltIdx+4;
6297 Elt = Op->getOperand(EltIdx);
6298 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6299 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
// The blend only works if each element is extracted from the same position
// of V1 that it occupies in the result; otherwise bail to the INSERTPS path.
6300 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6302 Mask[EltIdx] = EltIdx;
6306 // Let the shuffle legalizer deal with blend operations.
6307 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6308 if (V1.getSimpleValueType() != VT)
6309 V1 = DAG.getBitcast(VT, V1);
6310 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6313 // See if we can lower this build_vector to a INSERTPS.
6314 if (!Subtarget.hasSSE41())
// Elt/EltIdx here identify the single element that broke the blend pattern
// above; V2 is the vector it is extracted from.
6317 SDValue V2 = Elt.getOperand(0);
6318 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6321 bool CanFold = true;
// All elements after the mismatching one must still come in-place from V1.
6322 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6326 SDValue Current = Op->getOperand(i);
6327 SDValue SrcVector = Current->getOperand(0);
6330 CanFold = SrcVector == V1 &&
6331 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6337 assert(V1.getNode() && "Expected at least two non-zero elements!");
// INSERTPS operates on v4f32; bitcast integer vectors as needed.
6338 if (V1.getSimpleValueType() != MVT::v4f32)
6339 V1 = DAG.getBitcast(MVT::v4f32, V1);
6340 if (V2.getSimpleValueType() != MVT::v4f32)
6341 V2 = DAG.getBitcast(MVT::v4f32, V2);
6343 // Ok, we can emit an INSERTPS instruction.
6344 unsigned ZMask = Zeroable.to_ulong();
// INSERTPS imm8: bits [7:6] = source lane, [5:4] = dest lane, [3:0] = zmask.
6346 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6347 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6349 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6350 DAG.getIntPtrConstant(InsertPSMask, DL));
6351 return DAG.getBitcast(VT, Result);
6354 /// Return a vector logical shift node.
// Emits a whole-vector byte shift (VSHLDQ/VSRLDQ, i.e. PSLLDQ/PSRLDQ) by
// NumBits/8 bytes, operating on the value reinterpreted as v16i8.
6355 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6356 SelectionDAG &DAG, const TargetLowering &TLI,
6358 assert(VT.is128BitVector() && "Unknown type for VShift");
6359 MVT ShVT = MVT::v16i8;
6360 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6361 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6362 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
// The byte-shift instructions take a byte count, so NumBits must be a
// multiple of 8.
6363 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6364 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6365 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
// Try to turn a splat of a scalar stack load into an aligned vector load
// plus a splat shuffle, absorbing any constant pointer offset into the
// shuffle's element index.
6368 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6369 SelectionDAG &DAG) {
6371 // Check if the scalar load can be widened into a vector load. And if
6372 // the address is "base + cst" see if the cst can be "absorbed" into
6373 // the shuffle mask.
6374 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6375 SDValue Ptr = LD->getBasePtr();
6376 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
// Only 32-bit scalars are handled (element index is computed with >> 2).
6378 EVT PVT = LD->getValueType(0);
6379 if (PVT != MVT::i32 && PVT != MVT::f32)
// The pointer must resolve to a frame index, optionally plus a constant.
6384 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6385 FI = FINode->getIndex();
6387 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6388 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6389 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6390 Offset = Ptr.getConstantOperandVal(1);
6391 Ptr = Ptr.getOperand(0);
6396 // FIXME: 256-bit vector instructions don't require a strict alignment,
6397 // improve this code to support it better.
6398 unsigned RequiredAlign = VT.getSizeInBits()/8;
6399 SDValue Chain = LD->getChain();
6400 // Make sure the stack object alignment is at least 16 or 32.
6401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6402 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6403 if (MFI.isFixedObjectIndex(FI)) {
6404 // Can't change the alignment. FIXME: It's possible to compute
6405 // the exact stack offset and reference FI + adjust offset instead.
6406 // If someone *really* cares about this. That's the way to implement it.
// Non-fixed stack objects can simply be realigned to what we need.
6409 MFI.setObjectAlignment(FI, RequiredAlign);
6413 // (Offset % 16 or 32) must be multiple of 4. Then address is then
6414 // Ptr + (Offset & ~15).
6417 if ((Offset % RequiredAlign) & 3)
// Round the offset down to the vector alignment; the remainder becomes the
// splat element index below.
6419 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6422 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6423 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
// >> 2 because the scalar element is 4 bytes (i32/f32, checked above).
6426 int EltNo = (Offset - StartOffset) >> 2;
6427 unsigned NumElems = VT.getVectorNumElements();
6429 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6430 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6431 LD->getPointerInfo().getWithOffset(StartOffset));
// Splat the element that holds the originally loaded scalar.
6433 SmallVector<int, 8> Mask(NumElems, EltNo);
6435 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6441 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6442 /// elements can be replaced by a single large load which has the same value as
6443 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6445 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6446 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6447 const SDLoc &DL, SelectionDAG &DAG,
6448 bool isAfterLegalize) {
6449 unsigned NumElems = Elts.size();
6451 int LastLoadedElt = -1;
// Per-element classification: exactly one of these bits is set per element.
6452 SmallBitVector LoadMask(NumElems, false);
6453 SmallBitVector ZeroMask(NumElems, false);
6454 SmallBitVector UndefMask(NumElems, false);
6456 // For each element in the initializer, see if we've found a load, zero or an
6458 for (unsigned i = 0; i < NumElems; ++i) {
6459 SDValue Elt = peekThroughBitcasts(Elts[i]);
6464 UndefMask[i] = true;
6465 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6467 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6470 // Each loaded element must be the correct fractional portion of the
6471 // requested vector load.
6472 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6477 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6478 "Incomplete element masks");
6480 // Handle Special Cases - all undef or undef/zero.
6481 if (UndefMask.count() == NumElems)
6482 return DAG.getUNDEF(VT);
6484 // FIXME: Should we return this as a BUILD_VECTOR instead?
6485 if ((ZeroMask | UndefMask).count() == NumElems)
6486 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6487 : DAG.getConstantFP(0.0, DL, VT);
6489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6490 int FirstLoadedElt = LoadMask.find_first();
6491 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6492 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6493 EVT LDBaseVT = EltBase.getValueType();
6495 // Consecutive loads can contain UNDEFS but not ZERO elements.
6496 // Consecutive loads with UNDEFs and ZEROs elements require a
6497 // an additional shuffle stage to clear the ZERO elements.
6498 bool IsConsecutiveLoad = true;
6499 bool IsConsecutiveLoadWithZeros = true;
6500 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6502 SDValue Elt = peekThroughBitcasts(Elts[i]);
6503 LoadSDNode *LD = cast<LoadSDNode>(Elt);
// Each subsequent load must sit exactly (i - FirstLoadedElt) elements
// after LDBase in memory.
6504 if (!DAG.areNonVolatileConsecutiveLoads(
6505 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6506 i - FirstLoadedElt)) {
6507 IsConsecutiveLoad = false;
6508 IsConsecutiveLoadWithZeros = false;
6511 } else if (ZeroMask[i]) {
6512 IsConsecutiveLoad = false;
// Emits the wide load and splices its chain in place of LDBase's so later
// memory operations keep their ordering dependency.
6516 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6517 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6518 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6519 "Cannot merge volatile loads.");
6521 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6522 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6524 if (LDBase->hasAnyUseOfValue(1)) {
6526 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6527 SDValue(NewLd.getNode(), 1));
6528 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
// Undo the RAUW's effect on the TokenFactor itself so it still refers to
// the original chains rather than to itself.
6529 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6530 SDValue(NewLd.getNode(), 1));
6536 // LOAD - all consecutive load/undefs (must start/end with a load).
6537 // If we have found an entire vector of loads and undefs, then return a large
6538 // load of the entire vector width starting at the base pointer.
6539 // If the vector contains zeros, then attempt to shuffle those elements.
6540 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6541 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6542 assert(LDBase && "Did not find base load for merging consecutive loads");
6543 EVT EltVT = LDBase->getValueType(0);
6544 // Ensure that the input vector size for the merged loads matches the
6545 // cumulative size of the input elements.
6546 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6549 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6552 if (IsConsecutiveLoad)
6553 return CreateLoad(VT, LDBase);
6555 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6556 // vector and a zero vector to clear out the zero elements.
6557 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6558 SmallVector<int, 4> ClearMask(NumElems, -1);
6559 for (unsigned i = 0; i < NumElems; ++i) {
// Zero elements select from the zero vector (second shuffle operand).
6561 ClearMask[i] = i + NumElems;
6562 else if (LoadMask[i])
6565 SDValue V = CreateLoad(VT, LDBase);
6566 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6567 : DAG.getConstantFP(0.0, DL, VT);
6568 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
// Total bit width actually covered by the loaded elements.
6573 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6575 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6576 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6577 (LoadSize == 32 || LoadSize == 64) &&
6578 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6579 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6580 : MVT::getIntegerVT(LoadSize);
6581 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6582 if (TLI.isTypeLegal(VecVT)) {
6583 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6584 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6586 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6587 LDBase->getPointerInfo(),
6588 LDBase->getAlignment(),
6589 false/*isVolatile*/, true/*ReadMem*/,
6592 // Make sure the newly-created LOAD is in the same position as LDBase in
6593 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6594 // and update uses of LDBase's output chain to use the TokenFactor.
6595 if (LDBase->hasAnyUseOfValue(1)) {
6597 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6598 SDValue(ResNode.getNode(), 1));
6599 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6600 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6601 SDValue(ResNode.getNode(), 1));
6604 return DAG.getBitcast(VT, ResNode);
// Materialize the repeated splat pattern 'SplatValue' (SplatBitSize bits wide)
// as an IR ConstantVector of VT's scalar type, with SplatBitSize/ScalarSize
// elements — used to build a constant-pool entry for a sub-vector broadcast.
6611 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6612 unsigned SplatBitSize, LLVMContext &C) {
6613 unsigned ScalarSize = VT.getScalarSizeInBits();
6614 unsigned NumElm = SplatBitSize / ScalarSize;
6616 SmallVector<Constant *, 32> ConstantVec;
6617 for (unsigned i = 0; i < NumElm; i++) {
// Slice the i-th scalar out of the wide splat value.
6618 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6620 if (VT.isFloatingPoint()) {
6621 assert((ScalarSize == 32 || ScalarSize == 64) &&
6622 "Unsupported floating point scalar size");
6623 if (ScalarSize == 32)
6624 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6626 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6628 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6629 ConstantVec.push_back(Const);
6631 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
// Returns true if any user of N is a target shuffle, looking through
// BITCAST users recursively.
6634 static bool isUseOfShuffle(SDNode *N) {
6635 for (auto *U : N->uses()) {
6636 if (isTargetShuffle(U->getOpcode()))
6638 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6639 return isUseOfShuffle(U);
6644 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6645 /// following cases:
6646 /// 1. A splat BUILD_VECTOR which uses:
6647 /// a. A single scalar load, or a constant.
6648 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6649 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6650 /// a scalar load, or a constant.
6652 /// The VBROADCAST node is returned when a pattern is found,
6653 /// or SDValue() otherwise.
6654 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6655 SelectionDAG &DAG) {
6656 // VBROADCAST requires AVX.
6657 // TODO: Splats could be generated for non-AVX CPUs using SSE
6658 // instructions, but there's less potential gain for only 128-bit vectors.
6659 if (!Subtarget.hasAVX())
6662 MVT VT = BVOp->getSimpleValueType(0);
6665 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6666 "Unsupported vector type for broadcast.");
6668 BitVector UndefElements;
6669 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6671 // We need a splat of a single value to use broadcast, and it doesn't
6672 // make any sense if the value is only in one element of the vector.
6673 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
// Not a single-scalar splat — try to broadcast a repeated constant pattern
// that is wider than one element but narrower than the whole vector.
6674 APInt SplatValue, Undef;
6675 unsigned SplatBitSize;
6677 // Check if this is a repeated constant pattern suitable for broadcasting.
6678 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6679 SplatBitSize > VT.getScalarSizeInBits() &&
6680 SplatBitSize < VT.getSizeInBits()) {
6681 // Avoid replacing with broadcast when it's a use of a shuffle
6682 // instruction to preserve the present custom lowering of shuffles.
6683 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6685 // replace BUILD_VECTOR with broadcast of the repeated constants.
6686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6687 LLVMContext *Ctx = DAG.getContext();
6688 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6689 if (Subtarget.hasAVX()) {
6690 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6691 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6692 // Splatted value can fit in one INTEGER constant in constant pool.
6693 // Load the constant and broadcast it.
6694 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6695 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6696 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6697 SDValue CP = DAG.getConstantPool(C, PVT);
6698 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6700 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6702 CVT, dl, DAG.getEntryNode(), CP,
6703 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6705 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6706 MVT::getVectorVT(CVT, Repeat), Ld);
6707 return DAG.getBitcast(VT, Brdcst);
6708 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6709 // Splatted value can fit in one FLOAT constant in constant pool.
6710 // Load the constant and broadcast it.
6711 // AVX have support for 32 and 64 bit broadcast for floats only.
6712 // No 64bit integer in 32bit subtarget.
6713 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
// Reinterpret the splat bits as f32/f64 so VBROADCASTSS/SD can be used.
6714 Constant *C = SplatBitSize == 32
6715 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6716 SplatValue.bitsToFloat())
6717 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6718 SplatValue.bitsToDouble());
6719 SDValue CP = DAG.getConstantPool(C, PVT);
6720 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6722 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6724 CVT, dl, DAG.getEntryNode(), CP,
6725 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6727 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6728 MVT::getVectorVT(CVT, Repeat), Ld);
6729 return DAG.getBitcast(VT, Brdcst);
6730 } else if (SplatBitSize > 64) {
6731 // Load the vector of constants and broadcast it.
6732 MVT CVT = VT.getScalarType();
6733 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6735 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6736 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6737 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6739 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6740 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
// Pattern wider than 64 bits: broadcast a whole sub-vector instead.
6742 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6743 return DAG.getBitcast(VT, Brdcst);
// From here on, Ld is the single splatted scalar.
6750 bool ConstSplatVal =
6751 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6753 // Make sure that all of the users of a non-constant load are from the
6754 // BUILD_VECTOR node.
6755 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6758 unsigned ScalarSize = Ld.getValueSizeInBits();
6759 bool IsGE256 = (VT.getSizeInBits() >= 256);
6761 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6762 // instruction to save 8 or more bytes of constant pool data.
6763 // TODO: If multiple splats are generated to load the same constant,
6764 // it may be detrimental to overall size. There needs to be a way to detect
6765 // that condition to know if this is truly a size win.
6766 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6768 // Handle broadcasting a single constant scalar from the constant pool
6770 // On Sandybridge (no AVX2), it is still better to load a constant vector
6771 // from the constant pool and not to broadcast it from a scalar.
6772 // But override that restriction when optimizing for size.
6773 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6774 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6775 EVT CVT = Ld.getValueType();
6776 assert(!CVT.isVector() && "Must not broadcast a vector type");
6778 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6779 // For size optimization, also splat v2f64 and v2i64, and for size opt
6780 // with AVX2, also splat i8 and i16.
6781 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6782 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6783 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6784 const Constant *C = nullptr;
6785 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6786 C = CI->getConstantIntValue();
6787 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6788 C = CF->getConstantFPValue();
6790 assert(C && "Invalid constant type");
6792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6794 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6795 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6797 CVT, dl, DAG.getEntryNode(), CP,
6798 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6801 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6805 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6807 // Handle AVX2 in-register broadcasts.
6808 if (!IsLoad && Subtarget.hasInt256() &&
6809 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6810 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6812 // The scalar source must be a normal load.
6816 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6817 (Subtarget.hasVLX() && ScalarSize == 64))
6818 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6820 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6821 // double since there is no vbroadcastsd xmm
6822 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6823 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6824 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6827 // Unsupported broadcast.
6831 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6832 /// underlying vector and index.
6834 /// Modifies \p ExtractedFromVec to the real vector and returns the real
// index; looks through a wrapping vector_shuffle to find the element's true
// source. Returns the original (or shuffle-adjusted) extraction index.
6836 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6838 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
// Not a shuffle: the vector/index pair is already the real one.
6839 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6842 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6844 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6846 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6847 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6850 // In this case the vector is the extract_subvector expression and the index
6851 // is 2, as specified by the shuffle.
6852 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6853 SDValue ShuffleVec = SVOp->getOperand(0);
6854 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6855 assert(ShuffleVecVT.getVectorElementType() ==
6856 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6858 int ShuffleIdx = SVOp->getMaskElt(Idx);
// Only look through the shuffle when the mask picks from its first operand
// (or is undef); otherwise keep the original vector/index.
6859 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6860 ExtractedFromVec = ShuffleVec;
// Try to lower a BUILD_VECTOR whose operands are mostly extract_vector_elt
// from at most two source vectors as a vector_shuffle, followed by at most
// one INSERT_VECTOR_ELT for the odd element out.
6866 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6867 MVT VT = Op.getSimpleValueType();
6869 // Skip if insert_vec_elt is not supported.
6870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6871 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6875 unsigned NumElems = Op.getNumOperands();
6879 SmallVector<unsigned, 4> InsertIndices;
6880 SmallVector<int, 8> Mask(NumElems, -1);
6882 for (unsigned i = 0; i != NumElems; ++i) {
6883 unsigned Opc = Op.getOperand(i).getOpcode();
6885 if (Opc == ISD::UNDEF)
6888 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6889 // Quit if more than 1 elements need inserting.
6890 if (InsertIndices.size() > 1)
6893 InsertIndices.push_back(i);
6897 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6898 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6900 // Quit if non-constant index.
6901 if (!isa<ConstantSDNode>(ExtIdx))
// Resolve through any wrapping shuffle to the real source vector/index.
6903 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6905 // Quit if extracted from vector of different type.
6906 if (ExtractedFromVec.getValueType() != VT)
// Track up to two distinct source vectors (VecIn1/VecIn2).
6909 if (!VecIn1.getNode())
6910 VecIn1 = ExtractedFromVec;
6911 else if (VecIn1 != ExtractedFromVec) {
6912 if (!VecIn2.getNode())
6913 VecIn2 = ExtractedFromVec;
6914 else if (VecIn2 != ExtractedFromVec)
6915 // Quit if more than 2 vectors to shuffle
6919 if (ExtractedFromVec == VecIn1)
// Elements from the second vector are indexed past NumElems in the mask.
6921 else if (ExtractedFromVec == VecIn2)
6922 Mask[i] = Idx + NumElems;
6925 if (!VecIn1.getNode())
6928 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6929 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
// Patch in the (at most one) element that was not an extract.
6931 for (unsigned Idx : InsertIndices)
6932 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6933 DAG.getIntPtrConstant(Idx, DL));
// Pack an all-constant vXi1 build_vector into a single integer constant:
// bit 'idx' of the result holds element 'idx' of the vector. The result
// integer type matches the vector's total bit width, widened to at least
// i8 so the scalar type is materializable.
6938 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6939 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6940 Op.getScalarValueSizeInBits() == 1 &&
6941 "Can not convert non-constant vector");
6942 uint64_t Immediate = 0;
6943 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6944 SDValue In = Op.getOperand(idx);
// OR element idx's constant value into bit position idx.
6946 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6949 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6950 return DAG.getConstant(Immediate, dl, VT);
6952 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6954 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6956 MVT VT = Op.getSimpleValueType();
6957 assert((VT.getVectorElementType() == MVT::i1) &&
6958 "Unexpected type in LowerBUILD_VECTORvXi1!");
// All-zeros / all-ones mask vectors lower directly to target constants.
6961 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6962 return DAG.getTargetConstant(0, dl, VT);
6964 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6965 return DAG.getTargetConstant(1, dl, VT);
// Fully-constant vector: pack the bits into an integer immediate and
// bitcast back. When the packed immediate is wider than VT (short mask
// vectors packed into i8), go through v8i1 and take the low subvector.
6967 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6968 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6969 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6970 return DAG.getBitcast(VT, Imm);
6971 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6972 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6973 DAG.getIntPtrConstant(0, dl));
6976 // Vector has one or more non-const elements
6977 uint64_t Immediate = 0;
6978 SmallVector<unsigned, 16> NonConstIdx;
6979 bool IsSplat = true;
6980 bool HasConstElts = false;
// Partition elements: constants accumulate into Immediate, non-constants
// are remembered for later insertion; also detect whether all elements
// splat the same value.
6982 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6983 SDValue In = Op.getOperand(idx);
6986 if (!isa<ConstantSDNode>(In))
6987 NonConstIdx.push_back(idx);
6989 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6990 HasConstElts = true;
6994 else if (In != Op.getOperand(SplatIdx))
6998 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7000 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
7001 DAG.getConstant(1, dl, VT),
7002 DAG.getConstant(0, dl, VT));
7004 // insert elements one by one
// Materialize the constant part (if any) as the starting vector,
// again routing through v8i1 when the packed immediate is wider than VT.
7008 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7009 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7011 else if (HasConstElts)
7012 Imm = DAG.getConstant(0, dl, VT);
7014 Imm = DAG.getUNDEF(VT);
7015 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7016 DstVec = DAG.getBitcast(VT, Imm);
7018 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7019 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7020 DAG.getIntPtrConstant(0, dl));
// Insert each non-constant element into the constant base one at a time.
7023 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7024 unsigned InsertIdx = NonConstIdx[i];
7025 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7026 Op.getOperand(InsertIdx),
7027 DAG.getIntPtrConstant(InsertIdx, dl));
7032 /// \brief Return true if \p N implements a horizontal binop and return the
7033 /// operands for the horizontal binop into V0 and V1.
7035 /// This is a helper function of LowerToHorizontalOp().
7036 /// This function checks that the build_vector \p N in input implements a
7037 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7038 /// operation to match.
7039 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7040 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7041 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7044 /// This function only analyzes elements of \p N whose indices are
7045 /// in range [BaseIdx, LastIdx).
7046 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7048 unsigned BaseIdx, unsigned LastIdx,
7049 SDValue &V0, SDValue &V1) {
7050 EVT VT = N->getValueType(0);
7052 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7053 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7054 "Invalid Vector in input!");
// ADD/FADD commute, so the pair of extract indices may appear swapped.
7056 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7057 bool CanFold = true;
// Each result element is expected to consume two consecutive lanes of a
// source vector, starting at BaseIdx.
7058 unsigned ExpectedVExtractIdx = BaseIdx;
7059 unsigned NumElts = LastIdx - BaseIdx;
7060 V0 = DAG.getUNDEF(VT);
7061 V1 = DAG.getUNDEF(VT);
7063 // Check if N implements a horizontal binop.
7064 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7065 SDValue Op = N->getOperand(i + BaseIdx);
// Undef elements are tolerated but still advance the expected lane pair.
7068 if (Op->isUndef()) {
7069 // Update the expected vector extract index.
7070 if (i * 2 == NumElts)
7071 ExpectedVExtractIdx = BaseIdx;
7072 ExpectedVExtractIdx += 2;
7076 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7081 SDValue Op0 = Op.getOperand(0);
7082 SDValue Op1 = Op.getOperand(1);
7084 // Try to match the following pattern:
7085 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7086 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7087 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7088 Op0.getOperand(0) == Op1.getOperand(0) &&
7089 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7090 isa<ConstantSDNode>(Op1.getOperand(1)));
7094 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7095 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
// The lower half of the result elements must be fed by V0, the upper
// half by V1; capture each source the first time it is seen.
7097 if (i * 2 < NumElts) {
7099 V0 = Op0.getOperand(0);
7100 if (V0.getValueType() != VT)
7105 V1 = Op0.getOperand(0);
7106 if (V1.getValueType() != VT)
// Crossing into the upper half restarts the expected lane counter.
7109 if (i * 2 == NumElts)
7110 ExpectedVExtractIdx = BaseIdx;
7113 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7114 if (I0 == ExpectedVExtractIdx)
7115 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7116 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7117 // Try to match the following dag sequence:
7118 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7119 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
// Advance to the next pair of consecutive lanes.
7123 ExpectedVExtractIdx += 2;
7129 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7130 /// a concat_vector.
7132 /// This is a helper function of LowerToHorizontalOp().
7133 /// This function expects two 256-bit vectors called V0 and V1.
7134 /// At first, each vector is split into two separate 128-bit vectors.
7135 /// Then, the resulting 128-bit vectors are used to implement two
7136 /// horizontal binary operations.
7138 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7140 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7141 /// the two new horizontal binop.
7142 /// When Mode is set, the first horizontal binop dag node would take as input
7143 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7144 /// horizontal binop dag node would take as input the lower 128-bit of V1
7145 /// and the upper 128-bit of V1.
7147 /// HADD V0_LO, V0_HI
7148 /// HADD V1_LO, V1_HI
7150 /// Otherwise, the first horizontal binop dag node takes as input the lower
7151 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7152 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7154 /// HADD V0_LO, V1_LO
7155 /// HADD V0_HI, V1_HI
7157 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7158 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7159 /// the upper 128-bits of the result.
7160 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7161 const SDLoc &DL, SelectionDAG &DAG,
7162 unsigned X86Opcode, bool Mode,
7163 bool isUndefLO, bool isUndefHI) {
7164 MVT VT = V0.getSimpleValueType();
7165 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7166 "Invalid nodes in input!");
// Split each 256-bit input into its low and high 128-bit halves.
7168 unsigned NumElts = VT.getVectorNumElements();
7169 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7170 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7171 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7172 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7173 MVT NewVT = V0_LO.getSimpleValueType();
// Default both result halves to UNDEF; the branches below may fill them.
7175 SDValue LO = DAG.getUNDEF(NewVT);
7176 SDValue HI = DAG.getUNDEF(NewVT);
// Mode set: each binop consumes both halves of the same input vector.
7179 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7180 if (!isUndefLO && !V0->isUndef())
7181 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7182 if (!isUndefHI && !V1->isUndef())
7183 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
// Mode clear: each binop pairs the matching halves of V0 and V1.
7185 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7186 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7187 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7189 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7190 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
// Reassemble the two 128-bit results into the 256-bit result vector.
7193 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7196 /// Returns true iff \p BV builds a vector with the result equivalent to
7197 /// the result of ADDSUB operation.
7198 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7199 /// are written to the parameters \p Opnd0 and \p Opnd1.
7200 static bool isAddSub(const BuildVectorSDNode *BV,
7201 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7202 SDValue &Opnd0, SDValue &Opnd1) {
// ADDSUB patterns are only recognized for the FP vector types supported
// by the subtarget's feature level (SSE3 / AVX / AVX512 widths).
7204 MVT VT = BV->getSimpleValueType(0);
7205 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7206 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7207 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7210 unsigned NumElts = VT.getVectorNumElements();
7211 SDValue InVec0 = DAG.getUNDEF(VT);
7212 SDValue InVec1 = DAG.getUNDEF(VT);
7214 // Odd-numbered elements in the input build vector are obtained from
7215 // adding two integer/float elements.
7216 // Even-numbered elements in the input build vector are obtained from
7217 // subtracting two integer/float elements.
// Element 0 must be an FSUB; the expected opcode alternates each
// iteration via the std::swap at the bottom of the loop.
7218 unsigned ExpectedOpcode = ISD::FSUB;
7219 unsigned NextExpectedOpcode = ISD::FADD;
7220 bool AddFound = false;
7221 bool SubFound = false;
7223 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7224 SDValue Op = BV->getOperand(i);
7226 // Skip 'undef' values.
7227 unsigned Opcode = Op.getOpcode();
7228 if (Opcode == ISD::UNDEF) {
// An undef element still flips which opcode the next element must use.
7229 std::swap(ExpectedOpcode, NextExpectedOpcode);
7233 // Early exit if we found an unexpected opcode.
7234 if (Opcode != ExpectedOpcode)
7237 SDValue Op0 = Op.getOperand(0);
7238 SDValue Op1 = Op.getOperand(1);
7240 // Try to match the following pattern:
7241 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7242 // Early exit if we cannot match that sequence.
7243 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7244 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7245 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7246 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7247 Op0.getOperand(1) != Op1.getOperand(1))
7250 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7254 // We found a valid add/sub node. Update the information accordingly.
7260 // Update InVec0 and InVec1.
// Capture the two source vectors the first time each is seen; they must
// have the same type as the build_vector result.
7261 if (InVec0.isUndef()) {
7262 InVec0 = Op0.getOperand(0);
7263 if (InVec0.getSimpleValueType() != VT)
7266 if (InVec1.isUndef()) {
7267 InVec1 = Op1.getOperand(0);
7268 if (InVec1.getSimpleValueType() != VT)
7272 // Make sure that operands in input to each add/sub node always
7273 // come from a same pair of vectors.
7274 if (InVec0 != Op0.getOperand(0)) {
7275 if (ExpectedOpcode == ISD::FSUB)
7278 // FADD is commutable. Try to commute the operands
7279 // and then test again.
7280 std::swap(Op0, Op1);
7281 if (InVec0 != Op0.getOperand(0))
7285 if (InVec1 != Op1.getOperand(0))
7288 // Update the pair of expected opcodes.
7289 std::swap(ExpectedOpcode, NextExpectedOpcode);
7292 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7293 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7301 /// Returns true if is possible to fold MUL and an idiom that has already been
7302 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7303 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7304 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7306 /// Prior to calling this function it should be known that there is some
7307 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7308 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7309 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7310 /// of \p Opnd0 uses is expected to be equal to 2.
7311 /// For example, this function may be called for the following IR:
7312 /// %AB = fmul fast <2 x double> %A, %B
7313 /// %Sub = fsub fast <2 x double> %AB, %C
7314 /// %Add = fadd fast <2 x double> %AB, %C
7315 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7316 /// <2 x i32> <i32 0, i32 3>
7317 /// There is a def for %Addsub here, which potentially can be replaced by
7318 /// X86ISD::ADDSUB operation:
7319 /// %Addsub = X86ISD::ADDSUB %AB, %C
7320 /// and such ADDSUB can further be replaced with FMADDSUB:
7321 /// %Addsub = FMADDSUB %A, %B, %C.
7323 /// The main reason why this method is called before the replacement of the
7324 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7325 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7327 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7328 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
// The multiply must feed exactly the FADD and FSUB of the ADDSUB idiom
// (use_size() == 2, see comment above), and FMA must be available.
7329 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7330 !Subtarget.hasAnyFMA())
7333 // FIXME: These checks must match the similar ones in
7334 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7335 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7336 // or MUL + ADDSUB to FMADDSUB.
7337 const TargetOptions &Options = DAG.getTarget().Options;
7339 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
// Decompose: FMADDSUB(A, B, C) from ADDSUB(FMUL(A, B), C).
7344 Opnd1 = Opnd0.getOperand(1);
7345 Opnd0 = Opnd0.getOperand(0);
7350 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7351 /// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7352 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7353 const X86Subtarget &Subtarget,
7354 SelectionDAG &DAG) {
// First recognize the ADDSUB idiom; bail out if it does not match.
7355 SDValue Opnd0, Opnd1;
7356 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7359 MVT VT = BV->getSimpleValueType(0);
7362 // Try to generate X86ISD::FMADDSUB node here.
// Prefer fusing a feeding FMUL into FMADDSUB when legal (see isFMAddSub).
7364 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7365 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7367 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7368 // the ADDSUB idiom has been successfully recognized. There are no known
7369 // X86 targets with 512-bit ADDSUB instructions!
7370 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7372 if (VT.is512BitVector())
7375 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7378 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7379 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7380 const X86Subtarget &Subtarget,
7381 SelectionDAG &DAG) {
7382 MVT VT = BV->getSimpleValueType(0);
7383 unsigned NumElts = VT.getVectorNumElements();
7384 unsigned NumUndefsLO = 0;
7385 unsigned NumUndefsHI = 0;
7386 unsigned Half = NumElts/2;
7388 // Count the number of UNDEF operands in the build_vector in input.
7389 for (unsigned i = 0, e = Half; i != e; ++i)
7390 if (BV->getOperand(i)->isUndef())
7393 for (unsigned i = Half, e = NumElts; i != e; ++i)
7394 if (BV->getOperand(i)->isUndef())
7397 // Early exit if this is either a build_vector of all UNDEFs or all the
7398 // operands but one are UNDEF.
7399 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
// 128-bit cases: a single horizontal op covers the whole vector.
7403 SDValue InVec0, InVec1;
7404 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7405 // Try to match an SSE3 float HADD/HSUB.
7406 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7407 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7409 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7410 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7411 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7412 // Try to match an SSSE3 integer HADD/HSUB.
7413 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7414 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7416 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7417 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
// 256-bit cases below require AVX at minimum.
7420 if (!Subtarget.hasAVX())
7423 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7424 // Try to match an AVX horizontal add/sub of packed single/double
7425 // precision floating point values from 256-bit vectors.
// Each half of the build_vector is matched independently; the two
// matches must agree on the (non-undef) source vectors.
7426 SDValue InVec2, InVec3;
7427 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7428 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7429 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7430 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7431 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7433 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7434 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7435 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7436 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7437 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7438 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7439 // Try to match an AVX2 horizontal add/sub of signed integers.
7440 SDValue InVec2, InVec3;
7442 bool CanFold = true;
7444 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7445 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7446 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7447 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7448 X86Opcode = X86ISD::HADD;
7449 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7450 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7451 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7452 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7453 X86Opcode = X86ISD::HSUB;
7458 // Fold this build_vector into a single horizontal add/sub.
7459 // Do this only if the target has AVX2.
7460 if (Subtarget.hasAVX2())
7461 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7463 // Do not try to expand this build_vector into a pair of horizontal
7464 // add/sub if we can emit a pair of scalar add/sub.
7465 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7468 // Convert this build_vector into a pair of horizontal binop followed by
// Mode=false: pair matching 128-bit halves of InVec0/InVec1 (see
// ExpandHorizontalBinOp).
7470 bool isUndefLO = NumUndefsLO == Half;
7471 bool isUndefHI = NumUndefsHI == Half;
7472 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7473 isUndefLO, isUndefHI);
// Fallback: a whole-vector match expanded as two 128-bit horizontal ops
// with Mode=true (each op consumes both halves of one input).
7477 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7478 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7480 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7481 X86Opcode = X86ISD::HADD;
7482 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7483 X86Opcode = X86ISD::HSUB;
7484 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7485 X86Opcode = X86ISD::FHADD;
7486 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7487 X86Opcode = X86ISD::FHSUB;
7491 // Don't try to expand this build_vector into a pair of horizontal add/sub
7492 // if we can simply emit a pair of scalar add/sub.
7493 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7496 // Convert this build_vector into two horizontal add/sub followed by
7498 bool isUndefLO = NumUndefsLO == Half;
7499 bool isUndefHI = NumUndefsHI == Half;
7500 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7501 isUndefLO, isUndefHI);
7507 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7508 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7509 /// just apply the bit to the vectors.
7510 /// NOTE: Its not in our interest to start make a general purpose vectorizer
7511 /// from this, but enough scalar bit operations are created from the later
7512 /// legalization + scalarization stages to need basic support.
7513 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7514 SelectionDAG &DAG) {
7516 MVT VT = Op->getSimpleValueType(0);
7517 unsigned NumElems = VT.getVectorNumElements();
7518 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7520 // Check that all elements have the same opcode.
7521 // TODO: Should we allow UNDEFS and if so how many?
7522 unsigned Opcode = Op->getOperand(0).getOpcode();
7523 for (unsigned i = 1; i < NumElems; ++i)
7524 if (Opcode != Op->getOperand(i).getOpcode())
7527 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
// The vectorized opcode must be selectable (legal or promotable) for VT.
7534 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
// Split every element's two operands into separate LHS/RHS element lists.
7539 SmallVector<SDValue, 4> LHSElts, RHSElts;
7540 for (SDValue Elt : Op->ops()) {
7541 SDValue LHS = Elt.getOperand(0);
7542 SDValue RHS = Elt.getOperand(1);
7544 // We expect the canonicalized RHS operand to be the constant.
7545 if (!isa<ConstantSDNode>(RHS))
7547 LHSElts.push_back(LHS);
7548 RHSElts.push_back(RHS);
// Rebuild as (bitop (build_vector LHS...), (build_vector RHS...)).
7551 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7552 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7553 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7556 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7557 /// functionality to do this, so it's all zeros, all ones, or some derivation
7558 /// that is cheap to calculate.
7559 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7560 const X86Subtarget &Subtarget) {
7562 MVT VT = Op.getSimpleValueType();
7564 // Vectors containing all zeros can be matched by pxor and xorps.
7565 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7566 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7567 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
// vNi32 types are already in canonical form; others go through
// getZeroVector for canonicalization.
7568 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7571 return getZeroVector(VT, Subtarget, DAG, DL);
7574 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7575 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7576 // vpcmpeqd on 256-bit vectors.
7577 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7578 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7579 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7582 return getOnesVector(VT, DAG, DL);
7589 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7592 MVT VT = Op.getSimpleValueType();
7593 MVT ExtVT = VT.getVectorElementType();
7594 unsigned NumElems = Op.getNumOperands();
7596 // Generate vectors for predicate vectors.
7597 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7598 return LowerBUILD_VECTORvXi1(Op, DAG);
7600 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7601 return VectorConstant;
7603 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7604 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7606 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7607 return HorizontalOp;
7608 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7610 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7613 unsigned EVTBits = ExtVT.getSizeInBits();
7615 unsigned NumZero = 0;
7616 unsigned NumNonZero = 0;
7617 uint64_t NonZeros = 0;
7618 bool IsAllConstants = true;
7619 SmallSet<SDValue, 8> Values;
7620 for (unsigned i = 0; i < NumElems; ++i) {
7621 SDValue Elt = Op.getOperand(i);
7625 if (Elt.getOpcode() != ISD::Constant &&
7626 Elt.getOpcode() != ISD::ConstantFP)
7627 IsAllConstants = false;
7628 if (X86::isZeroNode(Elt))
7631 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7632 NonZeros |= ((uint64_t)1 << i);
7637 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7638 if (NumNonZero == 0)
7639 return DAG.getUNDEF(VT);
7641 // Special case for single non-zero, non-undef, element.
7642 if (NumNonZero == 1) {
7643 unsigned Idx = countTrailingZeros(NonZeros);
7644 SDValue Item = Op.getOperand(Idx);
7646 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7647 // the value are obviously zero, truncate the value to i32 and do the
7648 // insertion that way. Only do this if the value is non-constant or if the
7649 // value is a constant being inserted into element 0. It is cheaper to do
7650 // a constant pool load than it is to do a movd + shuffle.
7651 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7652 (!IsAllConstants || Idx == 0)) {
7653 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7655 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7656 MVT VecVT = MVT::v4i32;
7658 // Truncate the value (which may itself be a constant) to i32, and
7659 // convert it to a vector with movd (S2V+shuffle to zero extend).
7660 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7661 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7662 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7663 Item, Idx * 2, true, Subtarget, DAG));
7667 // If we have a constant or non-constant insertion into the low element of
7668 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7669 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7670 // depending on what the source datatype is.
7673 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7675 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7676 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7677 assert((VT.is128BitVector() || VT.is256BitVector() ||
7678 VT.is512BitVector()) &&
7679 "Expected an SSE value type!");
7680 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7681 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7682 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7685 // We can't directly insert an i8 or i16 into a vector, so zero extend
7687 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7688 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7689 if (VT.getSizeInBits() >= 256) {
7690 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7691 if (Subtarget.hasAVX()) {
7692 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7693 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7695 // Without AVX, we need to extend to a 128-bit vector and then
7696 // insert into the 256-bit vector.
7697 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7698 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7699 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7702 assert(VT.is128BitVector() && "Expected an SSE value type!");
7703 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7704 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7706 return DAG.getBitcast(VT, Item);
7710 // Is it a vector logical left shift?
7711 if (NumElems == 2 && Idx == 1 &&
7712 X86::isZeroNode(Op.getOperand(0)) &&
7713 !X86::isZeroNode(Op.getOperand(1))) {
7714 unsigned NumBits = VT.getSizeInBits();
7715 return getVShift(true, VT,
7716 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7717 VT, Op.getOperand(1)),
7718 NumBits/2, DAG, *this, dl);
7721 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7724 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7725 // is a non-constant being inserted into an element other than the low one,
7726 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7727 // movd/movss) to move this into the low element, then shuffle it into
7729 if (EVTBits == 32) {
7730 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7731 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7735 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7736 if (Values.size() == 1) {
7737 if (EVTBits == 32) {
7738 // Instead of a shuffle like this:
7739 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7740 // Check if it's possible to issue this instead.
7741 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7742 unsigned Idx = countTrailingZeros(NonZeros);
7743 SDValue Item = Op.getOperand(Idx);
7744 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7745 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7750 // A vector full of immediates; various special cases are already
7751 // handled, so this is best done with a single constant-pool load.
7755 // See if we can use a vector load to get all of the elements.
7756 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7757 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7758 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7762 // For AVX-length vectors, build the individual 128-bit pieces and use
7763 // shuffles to put them in place.
7764 if (VT.is256BitVector() || VT.is512BitVector()) {
7765 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7767 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7769 // Build both the lower and upper subvector.
7771 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7772 SDValue Upper = DAG.getBuildVector(
7773 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7775 // Recreate the wider vector with the lower and upper part.
7776 if (VT.is256BitVector())
7777 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7778 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7781 // Let legalizer expand 2-wide build_vectors.
7782 if (EVTBits == 64) {
7783 if (NumNonZero == 1) {
7784 // One half is zero or undef.
7785 unsigned Idx = countTrailingZeros(NonZeros);
7786 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7787 Op.getOperand(Idx));
7788 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7793 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7794 if (EVTBits == 8 && NumElems == 16)
7795 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7799 if (EVTBits == 16 && NumElems == 8)
7800 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7804 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7805 if (EVTBits == 32 && NumElems == 4)
7806 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7809 // If element VT is == 32 bits, turn it into a number of shuffles.
7810 if (NumElems == 4 && NumZero > 0) {
7811 SmallVector<SDValue, 8> Ops(NumElems);
7812 for (unsigned i = 0; i < 4; ++i) {
7813 bool isZero = !(NonZeros & (1ULL << i));
7815 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7817 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7820 for (unsigned i = 0; i < 2; ++i) {
7821 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7824 Ops[i] = Ops[i*2]; // Must be a zero vector.
7827 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7830 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7833 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7838 bool Reverse1 = (NonZeros & 0x3) == 2;
7839 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7843 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7844 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7846 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7849 if (Values.size() > 1 && VT.is128BitVector()) {
7850 // Check for a build vector from mostly shuffle plus few inserting.
7851 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7854 // For SSE 4.1, use insertps to put the high elements into the low element.
7855 if (Subtarget.hasSSE41()) {
7857 if (!Op.getOperand(0).isUndef())
7858 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7860 Result = DAG.getUNDEF(VT);
7862 for (unsigned i = 1; i < NumElems; ++i) {
7863 if (Op.getOperand(i).isUndef()) continue;
7864 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7865 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7870 // Otherwise, expand into a number of unpckl*, start by extending each of
7871 // our (non-undef) elements to the full vector width with the element in the
7872 // bottom slot of the vector (which generates no code for SSE).
7873 SmallVector<SDValue, 8> Ops(NumElems);
7874 for (unsigned i = 0; i < NumElems; ++i) {
7875 if (!Op.getOperand(i).isUndef())
7876 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7878 Ops[i] = DAG.getUNDEF(VT);
7881 // Next, we iteratively mix elements, e.g. for v4f32:
7882 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7883 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7884 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7885 unsigned EltStride = NumElems >> 1;
7886 while (EltStride != 0) {
7887 for (unsigned i = 0; i < EltStride; ++i) {
7888 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7889 // then it is safe to just drop this shuffle: V[i] is already in the
7890 // right place, the one element (since it's the first round) being
7891 // inserted as undef can be dropped. This isn't safe for successive
7892 // rounds because they will permute elements within both vectors.
7893 if (Ops[i+EltStride].isUndef() &&
7894 EltStride == NumElems/2)
7897 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7906 // 256-bit AVX can use the vinsertf128 instruction
7907 // to create 256-bit vectors from two other 128-bit ones.
// Lowers a CONCAT_VECTORS of 2 or 4 operands into 256-/512-bit vectors by
// recursively combining 128-/256-bit halves with concat128BitVectors /
// concat256BitVectors.
// NOTE(review): several source lines are elided in this view (e.g. the
// declaration of `dl`, presumably an SDLoc for Op, and some closing braces).
7908 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7910 MVT ResVT = Op.getSimpleValueType();
7912 assert((ResVT.is256BitVector() ||
7913 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7915 SDValue V1 = Op.getOperand(0);
7916 SDValue V2 = Op.getOperand(1);
7917 unsigned NumElems = ResVT.getVectorNumElements();
// 256-bit result: a single 128+128 concat suffices.
7918 if (ResVT.is256BitVector())
7919 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
// 512-bit result built from four 128-bit pieces: concat pairwise into two
// 256-bit halves, then concat those halves.
7921 if (Op.getNumOperands() == 4) {
7922 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7923 ResVT.getVectorNumElements()/2);
7924 SDValue V3 = Op.getOperand(2);
7925 SDValue V4 = Op.getOperand(3);
7926 return concat256BitVectors(
7927 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7928 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
// 512-bit result from two 256-bit operands.
7931 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
// Lowers CONCAT_VECTORS of i1 (mask) vectors. For >2 operands it either
// specializes all-undef / single-defined-operand cases or splits in half and
// recurses; for 2 operands it uses KUNPCK when legal, otherwise a pair of
// INSERT_SUBVECTOR nodes, exploiting known-zero inputs.
// NOTE(review): a number of source lines are elided in this view (loop body
// of the defined-op count, several early returns, and closing braces).
7934 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7935 const X86Subtarget &Subtarget,
7936 SelectionDAG & DAG) {
7938 MVT ResVT = Op.getSimpleValueType();
7939 unsigned NumOfOperands = Op.getNumOperands();
7941 assert(isPowerOf2_32(NumOfOperands) &&
7942 "Unexpected number of operands in CONCAT_VECTORS");
7944 SDValue Undef = DAG.getUNDEF(ResVT);
7945 if (NumOfOperands > 2) {
7946 // Specialize the cases when all, or all but one, of the operands are undef.
7947 unsigned NumOfDefinedOps = 0;
// Count defined (non-undef) operands; OpIdx presumably records the index of
// the last defined one (its assignment is on an elided line) — TODO confirm.
7949 for (unsigned i = 0; i < NumOfOperands; i++)
7950 if (!Op.getOperand(i).isUndef()) {
// All operands undef: the elided return is presumably Undef itself.
7954 if (NumOfDefinedOps == 0)
// Exactly one defined operand: insert it into an undef vector at its slot.
7956 if (NumOfDefinedOps == 1) {
7957 unsigned SubVecNumElts =
7958 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7959 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7960 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7961 Op.getOperand(OpIdx), IdxVal);
// General >2-operand case: split into two half-width concats and recurse.
7964 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7965 ResVT.getVectorNumElements()/2);
7966 SmallVector<SDValue, 2> Ops;
7967 for (unsigned i = 0; i < NumOfOperands/2; i++)
7968 Ops.push_back(Op.getOperand(i));
7969 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7971 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7972 Ops.push_back(Op.getOperand(i));
7973 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7974 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
// Two-operand case.
7978 SDValue V1 = Op.getOperand(0);
7979 SDValue V2 = Op.getOperand(1);
7980 unsigned NumElems = ResVT.getVectorNumElements();
7981 assert(V1.getValueType() == V2.getValueType() &&
7982 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7983 "Unexpected operands in CONCAT_VECTORS");
// Wide-enough masks can be concatenated directly with a KUNPCK instruction.
7985 if (ResVT.getSizeInBits() >= 16)
7986 return Op; // The operation is legal with KUNPCK
7988 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7989 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7990 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
// Both halves zero: the elided return is presumably ZeroVec — TODO confirm.
7991 if (IsZeroV1 && IsZeroV2)
7994 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Only the low half is defined: insert V1 at index 0 over undef or zero
// depending on whether V2 is zero (the guarding conditions are elided).
7996 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7998 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8000 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
// Only the high half is defined: insert V2 at NumElems/2.
8002 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8005 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
// General case: insert V1 at 0 into undef, then V2 at the halfway point.
8007 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8008 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
// Dispatcher for CONCAT_VECTORS lowering: i1 element vectors go to the
// AVX-512 mask path, everything else to the AVX vinsertf128-based path.
8011 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8012 const X86Subtarget &Subtarget,
8013 SelectionDAG &DAG) {
8014 MVT VT = Op.getSimpleValueType();
8015 if (VT.getVectorElementType() == MVT::i1)
8016 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
// Only 256-bit x2 and 512-bit x2/x4 concats are expected here.
8018 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8019 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8020 Op.getNumOperands() == 4)));
8022 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8023 // from two other 128-bit ones.
8025 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8026 return LowerAVXCONCAT_VECTORS(Op, DAG);
8029 //===----------------------------------------------------------------------===//
8030 // Vector shuffle lowering
8032 // This is an experimental code path for lowering vector shuffles on x86. It is
8033 // designed to handle arbitrary vector shuffles and blends, gracefully
8034 // degrading performance as necessary. It works hard to recognize idiomatic
8035 // shuffles and lower them to optimal instruction patterns without leaving
8036 // a framework that allows reasonably efficient handling of all vector shuffle
8038 //===----------------------------------------------------------------------===//
8040 /// \brief Tiny helper function to identify a no-op mask.
8042 /// This is a somewhat boring predicate function. It checks whether the mask
8043 /// array input, which is assumed to be a single-input shuffle mask of the kind
8044 /// used by the X86 shuffle instructions (not a fully general
8045 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8046 /// in-place shuffle are 'no-op's.
// Returns true iff every defined (>= 0) mask element maps an element to its
// own position; undef (-1) entries are ignored.
// NOTE(review): the loop's 'return false' and the final 'return true' are on
// lines elided from this view.
8047 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8048 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8049 assert(Mask[i] >= -1 && "Out of bound mask element!");
8050 if (Mask[i] >= 0 && Mask[i] != i)
8056 /// \brief Test whether there are elements crossing 128-bit lanes in this
8059 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8060 /// and we routinely test for these.
// Returns true iff any defined mask element references a different 128-bit
// lane than the one it lives in. `Mask[i] % Size` folds second-vector
// indices ([Size, 2*Size)) back into [0, Size) before the lane compare.
// NOTE(review): the 'return true' / 'return false' lines are elided here.
8061 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8062 int LaneSize = 128 / VT.getScalarSizeInBits();
8063 int Size = Mask.size();
8064 for (int i = 0; i < Size; ++i)
8065 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8070 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8072 /// This checks a shuffle mask to see if it is performing the same
8073 /// lane-relative shuffle in each sub-lane. This trivially implies
8074 /// that it is also not lane-crossing. It may however involve a blend from the
8075 /// same lane of a second vector.
8077 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8078 /// non-trivial to compute in the face of undef lanes. The representation is
8079 /// suitable for use with existing 128-bit shuffles as entries from the second
8080 /// vector have been remapped to [LaneSize, 2*LaneSize).
// Checks whether Mask performs the same lane-relative shuffle in every
// LaneSizeInBits-wide sub-lane; on success RepeatedMask holds that single
// per-lane mask, with second-vector entries remapped to [LaneSize, 2*LaneSize).
// NOTE(review): the failure 'return false' lines and final 'return true' are
// elided from this view.
8081 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8083 SmallVectorImpl<int> &RepeatedMask) {
8084 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8085 RepeatedMask.assign(LaneSize, -1);
8086 int Size = Mask.size();
8087 for (int i = 0; i < Size; ++i) {
8088 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8091 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8092 // This entry crosses lanes, so there is no way to model this shuffle.
8095 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8096 // Adjust second vector indices to start at LaneSize instead of Size.
8097 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8098 : Mask[i] % LaneSize + LaneSize;
8099 if (RepeatedMask[i % LaneSize] < 0)
8100 // This is the first non-undef entry in this slot of a 128-bit lane.
8101 RepeatedMask[i % LaneSize] = LocalM;
8102 else if (RepeatedMask[i % LaneSize] != LocalM)
8103 // Found a mismatch with the repeated mask.
8109 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
// Convenience wrapper: repeated-mask check at 128-bit lane granularity.
// NOTE(review): the leading 'static bool' line and closing brace are elided.
8111 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8112 SmallVectorImpl<int> &RepeatedMask) {
8113 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8116 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
// Convenience wrapper: repeated-mask check at 256-bit lane granularity.
// NOTE(review): the leading 'static bool' line and closing brace are elided.
8118 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8119 SmallVectorImpl<int> &RepeatedMask) {
8120 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8123 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8124 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
// Like isRepeatedShuffleMask but for target shuffle masks, which may contain
// SM_SentinelZero: zero entries must also repeat consistently per slot.
// NOTE(review): several 'return false' lines, a 'continue', and the final
// 'return true' are elided from this view.
8125 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8127 SmallVectorImpl<int> &RepeatedMask) {
8128 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8129 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8130 int Size = Mask.size();
8131 for (int i = 0; i < Size; ++i) {
8132 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8133 if (Mask[i] == SM_SentinelUndef)
// Zero entries may only land in slots that are so far undef or zero.
8135 if (Mask[i] == SM_SentinelZero) {
8136 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8138 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8141 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8142 // This entry crosses lanes, so there is no way to model this shuffle.
8145 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8146 // Adjust second vector indices to start at LaneSize instead of Size.
8148 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8149 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8150 // This is the first non-undef entry in this slot of a 128-bit lane.
8151 RepeatedMask[i % LaneSize] = LocalM;
8152 else if (RepeatedMask[i % LaneSize] != LocalM)
8153 // Found a mismatch with the repeated mask.
8159 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8162 /// This is a fast way to test a shuffle mask against a fixed pattern:
8164 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8166 /// It returns true if the mask is exactly as wide as the argument list, and
8167 /// each element of the mask is either -1 (signifying undef) or the value given
8168 /// in the argument.
// Tests Mask against ExpectedMask element-wise; -1 (undef) entries always
// match. When the inputs are BUILD_VECTORs, mismatching indices are still
// accepted if they reference equal scalar operands.
// NOTE(review): 'return false' / 'return true' lines are elided in this view.
8169 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8170 ArrayRef<int> ExpectedMask) {
8171 if (Mask.size() != ExpectedMask.size())
8174 int Size = Mask.size();
8176 // If the values are build vectors, we can look through them to find
8177 // equivalent inputs that make the shuffles equivalent.
8178 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8179 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8181 for (int i = 0; i < Size; ++i) {
8182 assert(Mask[i] >= -1 && "Out of bound mask element!");
8183 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
// Indices < Size select from V1, >= Size from V2; compare the underlying
// scalar operands when both sides are build vectors.
8184 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8185 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8186 if (!MaskBV || !ExpectedBV ||
8187 MaskBV->getOperand(Mask[i] % Size) !=
8188 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8196 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8198 /// The masks must be exactly the same width.
8200 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8201 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8203 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
// Element-wise compare of a target shuffle mask against an expected pattern:
// SM_SentinelUndef always matches; other negative values must be
// SM_SentinelZero and must match exactly; defined indices must be equal.
// NOTE(review): the 'return false'/'continue'/'return true' lines are elided.
8204 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8205 ArrayRef<int> ExpectedMask) {
8206 int Size = Mask.size();
8207 if (Size != (int)ExpectedMask.size())
8210 for (int i = 0; i < Size; ++i)
8211 if (Mask[i] == SM_SentinelUndef)
8213 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8215 else if (Mask[i] != ExpectedMask[i])
8221 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// Folds the Zeroable bit mask into a DAG shuffle mask: each zeroable slot
// becomes SM_SentinelZero, undef stays SM_SentinelUndef, everything else is
// copied through. Returns the merged target shuffle mask.
// NOTE(review): the declaration of the loop-local 'M' and the final return
// are on lines elided from this view.
8223 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8224 const APInt &Zeroable) {
8225 int NumElts = Mask.size();
8226 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8228 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8229 for (int i = 0; i != NumElts; ++i) {
8231 if (M == SM_SentinelUndef)
8233 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8234 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8239 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// Returns true when a v8i32/v8f32 mask matches the word-unpack (vpunpcklwd /
// vpunpckhwd) pattern generated for v8i16; other types are rejected up front.
8241 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8242 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8245 SmallVector<int, 8> Unpcklwd;
8246 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8247 /* Unary = */ false);
8248 SmallVector<int, 8> Unpckhwd;
8249 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8250 /* Unary = */ false);
8251 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8252 isTargetShuffleEquivalent(Mask, Unpckhwd));
8253 return IsUnpackwdMask;
8256 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8258 /// This helper function produces an 8-bit shuffle immediate corresponding to
8259 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8260 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8263 /// NB: We rely heavily on "undef" masks preserving the input lane.
// Packs a 4-element mask into the 8-bit x86 shuffle immediate (2 bits per
// lane). Undef entries default to the identity lane, preserving the input.
// NOTE(review): the 'unsigned Imm = 0;' declaration and 'return Imm;' are on
// lines elided from this view.
8264 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8265 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8266 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8267 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8268 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8269 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8272 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8273 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8274 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8275 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
// Wraps getV4X86ShuffleImm's result in an i8 constant DAG node.
8279 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8280 SelectionDAG &DAG) {
8281 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8284 /// \brief Compute whether each element of a shuffle is zeroable.
8286 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8287 /// Either it is an undef element in the shuffle mask, the element of the input
8288 /// referenced is undef, or the element of the input referenced is known to be
8289 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8290 /// as many lanes with this technique as possible to simplify the remaining
// Computes, per shuffle element, whether it can be lowered to zero: undef
// mask entries, elements of an all-zeros input, or BUILD_VECTOR operands that
// are undef/zero (including through size-changing bitcasts, where the
// overlapping scalar bits are inspected).
// NOTE(review): the lines that set bits in 'Zeroable', the 'M' normalization,
// the peekThroughBitcasts of V, and the final return are elided in this view.
8292 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8293 SDValue V1, SDValue V2) {
8294 APInt Zeroable(Mask.size(), 0);
8295 V1 = peekThroughBitcasts(V1);
8296 V2 = peekThroughBitcasts(V2);
8298 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8299 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8301 int VectorSizeInBits = V1.getValueSizeInBits();
8302 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8303 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8305 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8307 // Handle the easy cases.
8308 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8313 // Determine shuffle input and normalize the mask.
8314 SDValue V = M < Size ? V1 : V2;
8317 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8318 if (V.getOpcode() != ISD::BUILD_VECTOR)
8321 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8322 // the (larger) source element must be UNDEF/ZERO.
8323 if ((Size % V.getNumOperands()) == 0) {
8324 int Scale = Size / V->getNumOperands();
8325 SDValue Op = V.getOperand(M / Scale);
8326 if (Op.isUndef() || X86::isZeroNode(Op))
8328 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
// Shift down to the bits this shuffle element covers and test for zero.
8329 APInt Val = Cst->getAPIntValue();
8330 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8331 Val = Val.getLoBits(ScalarSizeInBits);
8334 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8335 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8336 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8337 Val = Val.getLoBits(ScalarSizeInBits);
8344 // If the BUILD_VECTOR has more elements then all the (smaller) source
8345 // elements must be UNDEF or ZERO.
8346 if ((V.getNumOperands() % Size) == 0) {
8347 int Scale = V->getNumOperands() / Size;
8348 bool AllZeroable = true;
8349 for (int j = 0; j < Scale; ++j) {
8350 SDValue Op = V.getOperand((M * Scale) + j);
8351 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8362 // The Shuffle result is as follow:
8363 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
8364 // Each Zeroable's element correspond to a particular Mask's element.
8365 // As described in computeZeroableShuffleElements function.
8367 // The function looks for a sub-mask that the nonzero elements are in
8368 // increasing order. If such sub-mask exist. The function returns true.
// Checks whether the non-zero elements of Mask appear in strictly increasing
// order (the shape VEXPAND produces). IsZeroSideLeft reports whether the
// first non-zero element comes from the second input half.
// NOTE(review): the zeroable check inside the loop, the 'NextElement'
// increment, and the trailing returns are on elided lines.
8369 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8370 ArrayRef<int> Mask, const EVT &VectorType,
8371 bool &IsZeroSideLeft) {
8372 int NextElement = -1;
8373 // Check if the Mask's nonzero elements are in increasing order.
8374 for (int i = 0, e = Mask.size(); i < e; i++) {
8375 // Checks if the mask's zeros elements are built from only zeros.
8376 assert(Mask[i] >= -1 && "Out of bound mask element!");
8381 // Find the lowest non zero element
8382 if (NextElement < 0) {
8383 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8384 IsZeroSideLeft = NextElement != 0;
8386 // Exit if the mask's non zero elements are not in increasing order.
8387 if (NextElement != Mask[i])
8394 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
// Lowers a shuffle to a single PSHUFB when every byte comes from one input
// (V1 or V2), no byte crosses a 128-bit lane, and zeroable slots are encoded
// via the sign-bit-set (0x80) byte mask.
// NOTE(review): the 'V2' parameter line, the declaration/tracking of the
// chosen source 'V', and several 'continue'/'return SDValue()' lines are
// elided from this view.
8395 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8396 ArrayRef<int> Mask, SDValue V1,
8398 const APInt &Zeroable,
8399 const X86Subtarget &Subtarget,
8400 SelectionDAG &DAG) {
8401 int Size = Mask.size();
8402 int LaneSize = 128 / VT.getScalarSizeInBits();
8403 const int NumBytes = VT.getSizeInBits() / 8;
8404 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8406 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8407 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8408 (Subtarget.hasBWI() && VT.is512BitVector()));
8410 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8411 // Sign bit set in i8 mask means zero element.
8412 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8415 for (int i = 0; i < NumBytes; ++i) {
8416 int M = Mask[i / NumEltBytes];
8418 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8421 if (Zeroable[i / NumEltBytes]) {
8422 PSHUFBMask[i] = ZeroMask;
8426 // We can only use a single input of V1 or V2.
8427 SDValue SrcV = (M >= Size ? V2 : V1);
8433 // PSHUFB can't cross lanes, ensure this doesn't happen.
8434 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
// Convert the element-index mask entry into a byte index.
8438 M = M * NumEltBytes + (i % NumEltBytes);
8439 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8441 assert(V && "Failed to find a source input");
8443 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8444 return DAG.getBitcast(
8445 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8446 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8449 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8450 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8453 // X86 has dedicated shuffle that can be lowered to VEXPAND
// Lowers a shuffle whose non-zero elements are in increasing order to an
// AVX-512 VEXPAND masked with ~Zeroable, selecting between the expanded
// vector and a zero vector via VSELECT.
// NOTE(review): the early 'return SDValue()' after the order check and the
// final operand of the VSELECT (presumably ZeroVector) are on elided lines.
8454 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8455 const APInt &Zeroable,
8456 ArrayRef<int> Mask, SDValue &V1,
8457 SDValue &V2, SelectionDAG &DAG,
8458 const X86Subtarget &Subtarget) {
8459 bool IsLeftZeroSide = true;
8460 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
// The expand mask is the complement of the zeroable bits.
8463 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8465 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8466 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8467 unsigned NumElts = VT.getVectorNumElements();
8468 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8469 "Unexpected number of vector elements");
8470 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8471 Subtarget, DAG, DL);
8472 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8473 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8474 return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
8475 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
// Tries to match TargetMask as an UNPCKL/UNPCKH pattern, setting
// UnpackOpcode and possibly rewriting V1/V2 (undef, zero-vector, or swapped
// operands). Handles direct, unary-with-zero, and commuted forms.
// NOTE(review): the 'SelectionDAG &DAG' parameter line, several 'return
// true/false' lines, the 'std::swap(V1, V2)' of the commuted cases, and the
// 'if (!IsUnary)' guard are elided from this view.
8479 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8480 unsigned &UnpackOpcode, bool IsUnary,
8481 ArrayRef<int> TargetMask, SDLoc &DL,
8483 const X86Subtarget &Subtarget) {
8484 int NumElts = VT.getVectorNumElements();
// Classify the even (M1) and odd (M2) mask positions as all-undef/all-zero.
8486 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8487 for (int i = 0; i != NumElts; i += 2) {
8488 int M1 = TargetMask[i + 0];
8489 int M2 = TargetMask[i + 1];
8490 Undef1 &= (SM_SentinelUndef == M1);
8491 Undef2 &= (SM_SentinelUndef == M2);
8492 Zero1 &= isUndefOrZero(M1);
8493 Zero2 &= isUndefOrZero(M2);
8495 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8496 "Zeroable shuffle detected");
8498 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8499 SmallVector<int, 64> Unpckl, Unpckh;
8500 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8501 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8502 UnpackOpcode = X86ISD::UNPCKL;
8503 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8504 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8508 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8509 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8510 UnpackOpcode = X86ISD::UNPCKH;
8511 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8512 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8516 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8517 if (IsUnary && (Zero1 || Zero2)) {
8518 // Don't bother if we can blend instead.
8519 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8520 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8523 bool MatchLo = true, MatchHi = true;
8524 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8525 int M = TargetMask[i];
8527 // Ignore if the input is known to be zero or the index is undef.
8528 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8529 (M == SM_SentinelUndef))
8532 MatchLo &= (M == Unpckl[i]);
8533 MatchHi &= (M == Unpckh[i]);
8536 if (MatchLo || MatchHi) {
8537 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8538 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8539 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8544 // If a binary shuffle, commute and try again.
8546 ShuffleVectorSDNode::commuteMask(Unpckl);
8547 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8548 UnpackOpcode = X86ISD::UNPCKL;
8553 ShuffleVectorSDNode::commuteMask(Unpckh);
8554 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8555 UnpackOpcode = X86ISD::UNPCKH;
8564 // X86 has dedicated unpack instructions that can handle specific blend
8565 // operations: UNPCKH and UNPCKL.
// Lowers a shuffle directly to UNPCKL/UNPCKH when the mask matches the
// canonical unpack-lo/hi pattern, trying the commuted operand order as well.
// NOTE(review): the trailing 'return SDValue()' on mismatch is on an elided
// line.
8566 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8567 ArrayRef<int> Mask, SDValue V1,
8568 SDValue V2, SelectionDAG &DAG) {
8569 SmallVector<int, 8> Unpckl;
8570 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8571 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8572 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8574 SmallVector<int, 8> Unpckh;
8575 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8576 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8577 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8579 // Commute and try again.
8580 ShuffleVectorSDNode::commuteMask(Unpckl);
8581 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8582 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8584 ShuffleVectorSDNode::commuteMask(Unpckh);
8585 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8586 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8591 /// \brief Try to emit a bitmask instruction for a shuffle.
8593 /// This handles cases where we can model a blend exactly as a bitmask due to
8594 /// one of the inputs being zeroable.
// Models a blend-with-zero shuffle as a single AND with a constant mask:
// each kept lane gets all-ones, each zeroed lane gets zero. Only one live
// input is allowed; integer types only.
// NOTE(review): the declaration of 'V', the zeroable 'continue', and the
// 'if (!V)' guard before the final return are on elided lines.
8595 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8596 SDValue V2, ArrayRef<int> Mask,
8597 const APInt &Zeroable,
8598 SelectionDAG &DAG) {
8599 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8600 MVT EltVT = VT.getVectorElementType();
8601 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8602 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8603 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8605 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8608 if (Mask[i] % Size != i)
8609 return SDValue(); // Not a blend.
8611 V = Mask[i] < Size ? V1 : V2;
8612 else if (V != (Mask[i] < Size ? V1 : V2))
8613 return SDValue(); // Can only let one input through the mask.
8615 VMaskOps[i] = AllOnes;
8618 return SDValue(); // No non-zeroable elements!
8620 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8621 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8624 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8626 /// This is used as a fallback approach when first class blend instructions are
8627 /// unavailable. Currently it is only suitable for integer vectors, but could
8628 /// be generalized for floating point vectors if desirable.
// Fallback blend via bit math: (V1 & M) | (V2 & ~M), where M has all-ones
// lanes for elements taken from V1. Only valid when every mask entry keeps
// its own position from one of the two inputs; integer types only.
8629 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8630 SDValue V2, ArrayRef<int> Mask,
8631 SelectionDAG &DAG) {
8632 assert(VT.isInteger() && "Only supports integer vector types!");
8633 MVT EltVT = VT.getVectorElementType();
8634 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8635 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8636 SmallVector<SDValue, 16> MaskOps;
8637 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8638 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8639 return SDValue(); // Shuffled input!
8640 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8643 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8644 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8645 // We have to cast V2 around.
// ANDNP is an x86 node on i64 lanes, hence the bitcasts to/from MaskVT.
8646 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8647 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8648 DAG.getBitcast(MaskVT, V1Mask),
8649 DAG.getBitcast(MaskVT, V2)));
8650 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8653 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8654 SDValue PreservedSrc,
8655 const X86Subtarget &Subtarget,
// Builds the BLENDI immediate for TargetMask: bit i set means lane i comes
// from V2. SM_SentinelZero lanes are retargeted to whichever input is known
// zero/undef (setting ForceV1Zero/ForceV2Zero and rewriting the mask
// in place); returns false if the mask is not a blend.
// NOTE(review): the identity-lane 'continue', the mismatch 'return false',
// the ForceV*Zero assignments, and the final 'return true' are elided here.
8658 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8659 MutableArrayRef<int> TargetMask,
8660 bool &ForceV1Zero, bool &ForceV2Zero,
8661 uint64_t &BlendMask) {
8662 bool V1IsZeroOrUndef =
8663 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8664 bool V2IsZeroOrUndef =
8665 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8668 ForceV1Zero = false, ForceV2Zero = false;
8669 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8671 // Attempt to generate the binary blend mask. If an input is zero then
8672 // we can use any lane.
8673 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8674 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8675 int M = TargetMask[i];
8676 if (M == SM_SentinelUndef)
8680 if (M == i + Size) {
8681 BlendMask |= 1ull << i;
8684 if (M == SM_SentinelZero) {
8685 if (V1IsZeroOrUndef) {
8690 if (V2IsZeroOrUndef) {
8692 BlendMask |= 1ull << i;
8693 TargetMask[i] = i + Size;
// Expands a per-element blend mask by Scale: each set bit i becomes Scale
// consecutive set bits starting at i*Scale (used when re-expressing a blend
// on narrower lanes, e.g. 64-bit elements as 32-bit dwords).
// NOTE(review): the 'return ScaledMask;' and closing brace are on elided
// lines.
8702 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8703 uint64_t ScaledMask = 0;
8704 for (int i = 0; i != Size; ++i)
8705 if (BlendMask & (1ull << i))
8706 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
8710 /// \brief Try to emit a blend instruction for a shuffle.
8712 /// This doesn't do any checks for the availability of instructions for blending
8713 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8714 /// be matched in the backend with the type given. What it does check for is
8715 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8716 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8717 SDValue V2, ArrayRef<int> Original,
8718 const APInt &Zeroable,
8719 const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG) {
8721 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
// BlendMask accumulates the BLENDI immediate, one bit per lane (0 selects V1,
// 1 selects V2). Presumably it is filled by matchVectorShuffleAsBlend — the
// call's trailing arguments are elided in this listing; confirm against the
// full source.
8723 uint64_t BlendMask = 0;
8724 bool ForceV1Zero = false, ForceV2Zero = false;
8725 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8729 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8731 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8733 V2 = getZeroVector(VT, Subtarget, DAG, DL);
// NOTE(review): the case labels of this switch are elided in this listing;
// each group below handles one family of vector types in the full source.
8735 switch (VT.SimpleTy) {
// Types with a native BLENDI form can use the immediate directly.
8740 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8741 DAG.getConstant(BlendMask, DL, MVT::i8));
8745 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8749 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8750 // that instruction.
8751 if (Subtarget.hasAVX2()) {
8752 // Scale the blend by the number of 32-bit dwords per element.
8753 int Scale = VT.getScalarSizeInBits() / 32;
8754 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8755 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8756 V1 = DAG.getBitcast(BlendVT, V1);
8757 V2 = DAG.getBitcast(BlendVT, V2);
8758 return DAG.getBitcast(
8759 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8760 DAG.getConstant(BlendMask, DL, MVT::i8)));
8764 // For integer shuffles we need to expand the mask and cast the inputs to
8765 // v8i16s prior to blending.
8766 int Scale = 8 / VT.getVectorNumElements();
8767 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8768 V1 = DAG.getBitcast(MVT::v8i16, V1);
8769 V2 = DAG.getBitcast(MVT::v8i16, V2);
8770 return DAG.getBitcast(VT,
8771 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8772 DAG.getConstant(BlendMask, DL, MVT::i8)));
8776 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8777 SmallVector<int, 8> RepeatedMask;
8778 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8779 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8780 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the repeated per-lane mask: bit i is set
// when lane i comes from V2.
8782 for (int i = 0; i < 8; ++i)
8783 if (RepeatedMask[i] >= 8)
8784 BlendMask |= 1ull << i;
8785 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8786 DAG.getConstant(BlendMask, DL, MVT::i8));
8792 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8793 "256-bit byte-blends require AVX2 support!");
// With AVX512 BWI+VLX, lower as a masked move (select) on the mask register.
// NOTE(review): the declaration of IntegerType appears on an elided line.
8795 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8797 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8798 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8799 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8802 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8803 if (SDValue Masked =
8804 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8807 // Scale the blend by the number of bytes per element.
8808 int Scale = VT.getScalarSizeInBits() / 8;
8810 // This form of blend is always done on bytes. Compute the byte vector
8812 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8814 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8815 // mix of LLVM's code generator and the x86 backend. We tell the code
8816 // generator that boolean values in the elements of an x86 vector register
8817 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8818 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8819 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8820 // of the element (the remaining are ignored) and 0 in that high bit would
8821 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8822 // the LLVM model for boolean values in vector elements gets the relevant
8823 // bit set, it is set backwards and over constrained relative to x86's
8825 SmallVector<SDValue, 32> VSELECTMask;
8826 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8827 for (int j = 0; j < Scale; ++j)
8828 VSELECTMask.push_back(
8829 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8830 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8833 V1 = DAG.getBitcast(BlendVT, V1);
8834 V2 = DAG.getBitcast(BlendVT, V2);
8835 return DAG.getBitcast(
8836 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8837 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
// NOTE(review): as above, IntegerType's declaration sits on an elided line.
8846 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8847 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8848 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8851 llvm_unreachable("Not a supported integer vector type!");
8855 /// \brief Try to lower as a blend of elements from two inputs followed by
8856 /// a single-input permutation.
8858 /// This matches the pattern where we can blend elements from two inputs and
8859 /// then reduce the shuffle to a single-input permutation.
8860 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8861 SDValue V1, SDValue V2,
8863 SelectionDAG &DAG) {
8864 // We build up the blend mask while checking whether a blend is a viable way
8865 // to reduce the shuffle.
// BlendMask keeps every used source element in its "home" slot (index
// Mask[i] % Size), so the blend itself never moves data; PermuteMask then
// places the blended elements into their final positions with a
// single-input shuffle.
8866 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8867 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8869 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8873 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
// A slot may only be claimed once; two different sources wanting the same
// home slot means a plain blend cannot produce the needed elements.
8875 if (BlendMask[Mask[i] % Size] < 0)
8876 BlendMask[Mask[i] % Size] = Mask[i];
8877 else if (BlendMask[Mask[i] % Size] != Mask[i])
8878 return SDValue(); // Can't blend in the needed input!
8880 PermuteMask[i] = Mask[i] % Size;
// Emit the two-step lowering: blend first, then permute the blended vector.
8883 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8884 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8887 /// \brief Generic routine to decompose a shuffle and blend into independent
8888 /// blends and permutes.
8890 /// This matches the extremely common pattern for handling combined
8891 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8892 /// operations. It will try to pick the best arrangement of shuffles and
8894 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8898 SelectionDAG &DAG) {
8899 // Shuffle the input elements into the desired positions in V1 and V2 and
8900 // blend them together.
8901 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8902 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8903 SmallVector<int, 32> BlendMask(Mask.size(), -1);
// Route each used element to the pre-shuffle mask of its source vector;
// BlendMask then picks lane i from whichever input supplies it.
8904 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8905 if (Mask[i] >= 0 && Mask[i] < Size) {
8906 V1Mask[i] = Mask[i];
8908 } else if (Mask[i] >= Size) {
8909 V2Mask[i] = Mask[i] - Size;
8910 BlendMask[i] = i + Size;
8913 // Try to lower with the simpler initial blend strategy unless one of the
8914 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8915 // shuffle may be able to fold with a load or other benefit. However, when
8916 // we'll have to do 2x as many shuffles in order to achieve this, blending
8917 // first is a better strategy.
8918 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8919 if (SDValue BlendPerm =
8920 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
// Fall back to shuffling each input independently, then blending the results.
8923 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8924 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8925 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8928 /// \brief Try to lower a vector shuffle as a rotation.
8930 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8931 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8932 ArrayRef<int> Mask) {
8933 int NumElts = Mask.size();
8935 // We need to detect various ways of spelling a rotation:
8936 // [11, 12, 13, 14, 15, 0, 1, 2]
8937 // [-1, 12, 13, 14, -1, -1, 1, -1]
8938 // [-1, -1, -1, -1, -1, -1, 1, 2]
8939 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8940 // [-1, 4, 5, 6, -1, -1, 9, -1]
8941 // [-1, 4, 5, 6, -1, -1, -1, -1]
// Rotation tracks the single rotation amount implied by all defined mask
// entries; Lo/Hi record which input supplies the low/high parts.
// NOTE(review): their declarations sit on lines elided from this listing.
8944 for (int i = 0; i < NumElts; ++i) {
8946 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8947 "Unexpected mask index.")
8951 // Determine where a rotated vector would have started.
8952 int StartIdx = i - (M % NumElts);
8954 // The identity rotation isn't interesting, stop.
8957 // If we found the tail of a vector the rotation must be the missing
8958 // front. If we found the head of a vector, it must be how much of the
8960 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
// Every defined mask element must agree on one rotation amount.
8963 Rotation = CandidateRotation;
8964 else if (Rotation != CandidateRotation)
8965 // The rotations don't match, so we can't match this mask.
8968 // Compute which value this mask is pointing at.
8969 SDValue MaskV = M < NumElts ? V1 : V2;
8971 // Compute which of the two target values this index should be assigned
8972 // to. This reflects whether the high elements are remaining or the low
8973 // elements are remaining.
8974 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8976 // Either set up this value if we've not encountered it before, or check
8977 // that it remains consistent.
8980 else if (TargetV != MaskV)
8981 // This may be a rotation, but it pulls from the inputs in some
8982 // unsupported interleaving.
8986 // Check that we successfully analyzed the mask, and normalize the results.
8987 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8988 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9000 /// \brief Try to lower a vector shuffle as a byte rotation.
9002 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9003 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9004 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9005 /// try to generically lower a vector shuffle through such a pattern. It
9006 /// does not check for the profitability of lowering either as PALIGNR or
9007 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9008 /// This matches shuffle vectors that look like:
9010 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9012 /// Essentially it concatenates V1 and V2, shifts right by some number of
9013 /// elements, and takes the low elements as the result. Note that while this is
9014 /// specified as a *right shift* because x86 is little-endian, it is a *left
9015 /// rotate* of the vector lanes.
9016 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9017 ArrayRef<int> Mask) {
9018 // Don't accept any shuffles with zero elements.
9019 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9022 // PALIGNR works on 128-bit lanes.
9023 SmallVector<int, 16> RepeatedMask;
9024 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
// Match the rotation on the repeated per-lane mask; V1/V2 are taken by
// reference so the element-rotate matcher can reorder/update them.
9027 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9031 // PALIGNR rotates bytes, so we need to scale the
9032 // rotation based on how many bytes are in the vector lane.
9033 int NumElts = RepeatedMask.size();
9034 int Scale = 16 / NumElts;
9035 return Rotation * Scale;
9038 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9039 SDValue V1, SDValue V2,
9041 const X86Subtarget &Subtarget,
9042 SelectionDAG &DAG) {
9043 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9045 SDValue Lo = V1, Hi = V2;
9046 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9047 if (ByteRotation <= 0)
9050 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9052 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9053 Lo = DAG.getBitcast(ByteVT, Lo);
9054 Hi = DAG.getBitcast(ByteVT, Hi);
9056 // SSSE3 targets can use the palignr instruction.
9057 if (Subtarget.hasSSSE3()) {
9058 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9059 "512-bit PALIGNR requires BWI instructions");
9060 return DAG.getBitcast(
9061 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9062 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9065 assert(VT.is128BitVector() &&
9066 "Rotate-based lowering only supports 128-bit lowering!");
9067 assert(Mask.size() <= 16 &&
9068 "Can shuffle at most 16 bytes in a 128-bit vector!");
9069 assert(ByteVT == MVT::v16i8 &&
9070 "SSE2 rotate lowering only needed for v16i8!");
9072 // Default SSE2 implementation
9073 int LoByteShift = 16 - ByteRotation;
9074 int HiByteShift = ByteRotation;
// Without PALIGNR: shift the low input left and the high input right by
// complementary byte counts, then OR the two halves together.
9076 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9077 DAG.getConstant(LoByteShift, DL, MVT::i8));
9078 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9079 DAG.getConstant(HiByteShift, DL, MVT::i8));
9080 return DAG.getBitcast(VT,
9081 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9084 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9086 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9087 /// rotation of the concatenation of two vectors; this routine will
9088 /// try to generically lower a vector shuffle through such a pattern.
9090 /// Essentially it concatenates V1 and V2, shifts right by some number of
9091 /// elements, and takes the low elements as the result. Note that while this is
9092 /// specified as a *right shift* because x86 is little-endian, it is a *left
9093 /// rotate* of the vector lanes.
9094 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9095 SDValue V1, SDValue V2,
9097 const X86Subtarget &Subtarget,
9098 SelectionDAG &DAG) {
9099 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9100 "Only 32-bit and 64-bit elements are supported!");
9102 // 128/256-bit vectors are only supported with VLX.
9103 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9104 && "VLX required for 128/256-bit vectors");
9106 SDValue Lo = V1, Hi = V2;
9107 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
// The rotation amount here is in whole elements (as produced by
// matchVectorShuffleAsRotate), matching VALIGND/VALIGNQ's immediate.
9111 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9112 DAG.getConstant(Rotation, DL, MVT::i8));
9115 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9117 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9118 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9119 /// matches elements from one of the input vectors shuffled to the left or
9120 /// right with zeroable elements 'shifted in'. It handles both the strictly
9121 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9124 /// PSHL : (little-endian) left bit shift.
9125 /// [ zz, 0, zz, 2 ]
9126 /// [ -1, 4, zz, -1 ]
9127 /// PSRL : (little-endian) right bit shift.
9129 /// [ -1, -1, 7, zz]
9130 /// PSLLDQ : (little-endian) left byte shift
9131 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9132 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9133 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9134 /// PSRLDQ : (little-endian) right byte shift
9135 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9136 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9137 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
9138 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9139 unsigned ScalarSizeInBits,
9140 ArrayRef<int> Mask, int MaskOffset,
9141 const APInt &Zeroable,
9142 const X86Subtarget &Subtarget) {
9143 int Size = Mask.size();
9144 unsigned SizeInBits = Size * ScalarSizeInBits;
// CheckZeros verifies that every lane the candidate shift would vacate
// (the low lanes for a left shift, the high lanes for a right shift) is
// zeroable.
9146 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9147 for (int i = 0; i < Size; i += Scale)
9148 for (int j = 0; j < Shift; ++j)
9149 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
// MatchShift checks that the surviving lanes form the required sequential
// run and, on success, fills Opcode/ShiftVT and returns the shift amount.
9155 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9156 for (int i = 0; i != Size; i += Scale) {
9157 unsigned Pos = Left ? i + Shift : i;
9158 unsigned Low = Left ? i : i + Shift;
9159 unsigned Len = Scale - Shift;
9160 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
// Shifts of elements wider than 64 bits must use the whole-vector byte
// shifts (VSHLDQ/VSRLDQ); otherwise use the per-element bit shifts.
9164 int ShiftEltBits = ScalarSizeInBits * Scale;
9165 bool ByteShift = ShiftEltBits > 64;
9166 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9167 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9168 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9170 // Normalize the scale for byte shifts to still produce an i64 element
9172 Scale = ByteShift ? Scale / 2 : Scale;
9174 // We need to round trip through the appropriate type for the shift.
9175 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9176 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9177 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9178 return (int)ShiftAmt;
9181 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9182 // keep doubling the size of the integer elements up to that. We can
9183 // then shift the elements of the integer vector by whole multiples of
9184 // their width within the elements of the larger integer vector. Test each
9185 // multiple to see if we can find a match with the moved element indices
9186 // and that the shifted in elements are all zeroable.
9187 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9188 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9189 for (int Shift = 1; Shift != Scale; ++Shift)
9190 for (bool Left : {true, false})
9191 if (CheckZeros(Shift, Scale, Left)) {
9192 int ShiftAmt = MatchShift(Shift, Scale, Left);
9201 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9202 SDValue V2, ArrayRef<int> Mask,
9203 const APInt &Zeroable,
9204 const X86Subtarget &Subtarget,
9205 SelectionDAG &DAG) {
9206 int Size = Mask.size();
9207 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
// NOTE(review): the declarations of ShiftVT, Opcode and V, plus the
// failure checks on ShiftAmt, sit on lines elided from this listing.
9213 // Try to match shuffle against V1 shift.
9214 int ShiftAmt = matchVectorShuffleAsShift(
9215 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9217 // If V1 failed, try to match shuffle against V2 shift.
// For the V2 match, offset the expected mask indices by Size so they refer
// into the second operand.
9220 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9221 Mask, Size, Zeroable, Subtarget);
// Perform the shift in the matcher-selected type, then cast back.
9228 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9229 "Illegal integer vector type");
9230 V = DAG.getBitcast(ShiftVT, V);
9231 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9232 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9233 return DAG.getBitcast(VT, V);
9236 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9237 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9238 SDValue V2, ArrayRef<int> Mask,
9239 const APInt &Zeroable,
9240 SelectionDAG &DAG) {
9241 int Size = Mask.size();
9242 int HalfSize = Size / 2;
9243 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9244 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9246 // Upper half must be undefined.
9247 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9250 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9251 // Remainder of lower half result is zero and upper half is all undef.
9252 auto LowerAsEXTRQ = [&]() {
9253 // Determine the extraction length from the part of the
9254 // lower half that isn't zeroable.
9256 for (; Len > 0; --Len)
9257 if (!Zeroable[Len - 1])
9259 assert(Len > 0 && "Zeroable shuffle mask");
9261 // Attempt to match first Len sequential elements from the lower half.
9264 for (int i = 0; i != Len; ++i) {
9268 SDValue &V = (M < Size ? V1 : V2);
9271 // The extracted elements must start at a valid index and all mask
9272 // elements must be in the lower half.
9273 if (i > M || M >= HalfSize)
9276 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9287 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
// EXTRQI's length/index immediates are 6-bit fields, hence the & 0x3f.
9288 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9289 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9290 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
9291 DAG.getConstant(BitLen, DL, MVT::i8),
9292 DAG.getConstant(BitIdx, DL, MVT::i8));
9295 if (SDValue ExtrQ = LowerAsEXTRQ())
9298 // INSERTQ: Extract lowest Len elements from lower half of second source and
9299 // insert over first source, starting at Idx.
9300 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9301 auto LowerAsInsertQ = [&]() {
9302 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9305 // Attempt to match first source from mask before insertion point.
9306 if (isUndefInRange(Mask, 0, Idx)) {
9308 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9310 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9316 // Extend the extraction length looking to match both the insertion of
9317 // the second source and the remaining elements of the first.
9318 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9323 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9325 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9331 // Match the remaining elements of the lower half.
9332 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9334 } else if ((!Base || (Base == V1)) &&
9335 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9337 } else if ((!Base || (Base == V2)) &&
9338 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9345 // We may not have a base (first source) - this can safely be undefined.
9347 Base = DAG.getUNDEF(VT);
// As with EXTRQI, the immediates are 6-bit bit-length/bit-index fields.
9349 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9350 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9351 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
9352 DAG.getConstant(BitLen, DL, MVT::i8),
9353 DAG.getConstant(BitIdx, DL, MVT::i8));
9360 if (SDValue InsertQ = LowerAsInsertQ())
9366 /// \brief Lower a vector shuffle as a zero or any extension.
9368 /// Given a specific number of elements, element bit width, and extension
9369 /// stride, produce either a zero or any extension based on the available
9370 /// features of the subtarget. The extended elements are consecutive and
9371 /// can begin at an offsetted element index in the input; to
9372 /// avoid excess shuffling, the offset must either be in the bottom lane
9373 /// or at the start of a higher lane. All extended elements must be from
9375 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9376 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9377 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9378 assert(Scale > 1 && "Need a scale to extend.");
9379 int EltBits = VT.getScalarSizeInBits();
9380 int NumElements = VT.getVectorNumElements();
9381 int NumEltsPerLane = 128 / EltBits;
9382 int OffsetLane = Offset / NumEltsPerLane;
9383 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9384 "Only 8, 16, and 32 bit elements can be extended.");
9385 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9386 assert(0 <= Offset && "Extension offset must be positive.");
9387 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9388 "Extension offset must be in the first lane or start an upper lane.");
9390 // Check that an index is in same lane as the base offset.
9391 auto SafeOffset = [&](int Idx) {
9392 return OffsetLane == (Idx / NumEltsPerLane);
9395 // Shift along an input so that the offset base moves to the first element.
9396 auto ShuffleOffset = [&](SDValue V) {
9400 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9401 for (int i = 0; i * Scale < NumElements; ++i) {
9402 int SrcIdx = i + Offset;
9403 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9405 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9408 // Found a valid zext mask! Try various lowering strategies based on the
9409 // input type and available ISA extensions.
9410 if (Subtarget.hasSSE41()) {
9411 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9412 // PUNPCK will catch this in a later shuffle match.
9413 if (Offset && Scale == 2 && VT.is128BitVector())
// SSE4.1+: use the native zero/any-extend node (PMOVZX-style) directly.
9415 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9416 NumElements / Scale);
9417 InputV = ShuffleOffset(InputV);
9418 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9419 return DAG.getBitcast(VT, InputV);
9422 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9424 // For any extends we can cheat for larger element sizes and use shuffle
9425 // instructions that can fold with a load and/or copy.
9426 if (AnyExt && EltBits == 32) {
9427 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9429 return DAG.getBitcast(
9430 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9431 DAG.getBitcast(MVT::v4i32, InputV),
9432 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9434 if (AnyExt && EltBits == 16 && Scale > 2) {
9435 int PSHUFDMask[4] = {Offset / 2, -1,
9436 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9437 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9438 DAG.getBitcast(MVT::v4i32, InputV),
9439 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440 int PSHUFWMask[4] = {1, -1, -1, -1};
// Offset parity selects whether the wanted i16 sits in the low or high
// half of its dword, hence PSHUFLW vs PSHUFHW.
9441 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9442 return DAG.getBitcast(
9443 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9444 DAG.getBitcast(MVT::v8i16, InputV),
9445 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9448 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9450 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9451 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9452 assert(VT.is128BitVector() && "Unexpected vector width!");
9454 int LoIdx = Offset * EltBits;
9455 SDValue Lo = DAG.getBitcast(
9456 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9457 DAG.getConstant(EltBits, DL, MVT::i8),
9458 DAG.getConstant(LoIdx, DL, MVT::i8)));
// If only the low result half is live (or the next source element would
// cross a lane), a single EXTRQI suffices.
9460 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9461 !SafeOffset(Offset + 1))
9462 return DAG.getBitcast(VT, Lo);
9464 int HiIdx = (Offset + 1) * EltBits;
9465 SDValue Hi = DAG.getBitcast(
9466 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9467 DAG.getConstant(EltBits, DL, MVT::i8),
9468 DAG.getConstant(HiIdx, DL, MVT::i8)));
9469 return DAG.getBitcast(VT,
9470 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9473 // If this would require more than 2 unpack instructions to expand, use
9474 // pshufb when available. We can only use more than 2 unpack instructions
9475 // when zero extending i8 elements which also makes it easier to use pshufb.
9476 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9477 assert(NumElements == 16 && "Unexpected byte vector width!");
9478 SDValue PSHUFBMask[16];
// 0x80 in a PSHUFB control byte zeroes the destination byte.
9479 for (int i = 0; i < 16; ++i) {
9480 int Idx = Offset + (i / Scale);
9481 PSHUFBMask[i] = DAG.getConstant(
9482 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9484 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9485 return DAG.getBitcast(
9486 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9487 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9490 // If we are extending from an offset, ensure we start on a boundary that
9491 // we can unpack from.
9492 int AlignToUnpack = Offset % (NumElements / Scale);
9493 if (AlignToUnpack) {
9494 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9495 for (int i = AlignToUnpack; i < NumElements; ++i)
9496 ShMask[i - AlignToUnpack] = i;
9497 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9498 Offset -= AlignToUnpack;
9501 // Otherwise emit a sequence of unpacks.
// NOTE(review): the do { opener of this unpack loop is on an elided line;
// presumably Scale is halved there each iteration until it reaches 1.
9503 unsigned UnpackLoHi = X86ISD::UNPCKL;
9504 if (Offset >= (NumElements / 2)) {
9505 UnpackLoHi = X86ISD::UNPCKH;
9506 Offset -= (NumElements / 2);
// Unpack against UNDEF for an any-extend, against zero for a zero-extend.
9509 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9510 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9511 : getZeroVector(InputVT, Subtarget, DAG, DL);
9512 InputV = DAG.getBitcast(InputVT, InputV);
9513 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9517 } while (Scale > 1);
9518 return DAG.getBitcast(VT, InputV);
9521 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9523 /// This routine will try to do everything in its power to cleverly lower
9524 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9525 /// check for the profitability of this lowering, it tries to aggressively
9526 /// match this pattern. It will use all of the micro-architectural details it
9527 /// can to emit an efficient lowering. It handles both blends with all-zero
9528 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9529 /// masking out later).
9531 /// The reason we have dedicated lowering for zext-style shuffles is that they
9532 /// are both incredibly common and often quite performance sensitive.
9533 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9534 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9535 const APInt &Zeroable, const X86Subtarget &Subtarget,
9536 SelectionDAG &DAG) {
9537 int Bits = VT.getSizeInBits();
9538 int NumLanes = Bits / 128;
9539 int NumElements = VT.getVectorNumElements();
9540 int NumEltsPerLane = NumElements / NumLanes;
9541 assert(VT.getScalarSizeInBits() <= 32 &&
9542 "Exceeds 32-bit integer zero extension limit");
9543 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9545 // Define a helper function to check a particular ext-scale and lower to it if
// Lower tries one extension scale: each group of Scale output elements must
// be one consecutive source element followed by zeroable/undef padding.
// NOTE(review): the declarations of AnyExt, InputV, Offset and Matches sit
// on lines elided from this listing.
9547 auto Lower = [&](int Scale) -> SDValue {
9552 for (int i = 0; i < NumElements; ++i) {
9555 continue; // Valid anywhere but doesn't tell us anything.
9556 if (i % Scale != 0) {
9557 // Each of the extended elements need to be zeroable.
9561 // We no longer are in the anyext case.
9566 // Each of the base elements needs to be consecutive indices into the
9567 // same input vector.
9568 SDValue V = M < NumElements ? V1 : V2;
9569 M = M % NumElements;
9572 Offset = M - (i / Scale);
9573 } else if (InputV != V)
9574 return SDValue(); // Flip-flopping inputs.
9576 // Offset must start in the lowest 128-bit lane or at the start of an
9578 // FIXME: Is it ever worth allowing a negative base offset?
9579 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9580 (Offset % NumEltsPerLane) == 0))
9583 // If we are offsetting, all referenced entries must come from the same
9585 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9588 if ((M % NumElements) != (Offset + (i / Scale)))
9589 return SDValue(); // Non-consecutive strided elements.
9593 // If we fail to find an input, we have a zero-shuffle which should always
9594 // have already been handled.
9595 // FIXME: Maybe handle this here in case during blending we end up with one?
9599 // If we are offsetting, don't extend if we only match a single input, we
9600 // can always do better by using a basic PSHUF or PUNPCK.
9601 if (Offset != 0 && Matches < 2)
9604 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9605 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9608 // The widest scale possible for extending is to a 64-bit integer.
9609 assert(Bits % 64 == 0 &&
9610 "The number of bits in a vector must be divisible by 64 on x86!");
9611 int NumExtElements = Bits / 64;
9613 // Each iteration, try extending the elements half as much, but into twice as
9615 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9616 assert(NumElements % NumExtElements == 0 &&
9617 "The input vector size must be divisible by the extended size.");
9618 if (SDValue V = Lower(NumElements / NumExtElements))
9622 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9626 // Returns one of the source operands if the shuffle can be reduced to a
9627 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9628 auto CanZExtLowHalf = [&]() {
// The whole upper half must be zeroable, and the lower half must be the
// identity sequence from either V1 or V2.
9629 for (int i = NumElements / 2; i != NumElements; ++i)
9632 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9634 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9639 if (SDValue V = CanZExtLowHalf()) {
9640 V = DAG.getBitcast(MVT::v2i64, V);
9641 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9642 return DAG.getBitcast(VT, V);
9645 // No viable ext lowering found.
9649 /// \brief Try to get a scalar value for a specific element of a vector.
9651 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9652 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9653 SelectionDAG &DAG) {
9654 MVT VT = V.getSimpleValueType();
9655 MVT EltVT = VT.getVectorElementType();
9656 V = peekThroughBitcasts(V);
9658 // If the bitcasts shift the element size, we can't extract an equivalent
9660 MVT NewVT = V.getSimpleValueType();
9661 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
// Only BUILD_VECTOR operands (or the lone operand of a SCALAR_TO_VECTOR
// when asking for element 0) expose the scalar directly.
9664 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9665 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9666 // Ensure the scalar operand is the same size as the destination.
9667 // FIXME: Add support for scalar truncation where possible.
9668 SDValue S = V.getOperand(Idx);
9669 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9670 return DAG.getBitcast(EltVT, S);
9676 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9678 /// This is particularly important because the set of instructions varies
9679 /// significantly based on whether the operand is a load or not.
9680 static bool isShuffleFoldableLoad(SDValue V) {
// Look through bitcasts: a bitcast of a plain (non-extending) load is still
// foldable as a shuffle memory operand.
9681 V = peekThroughBitcasts(V);
9682 return ISD::isNON_EXTLoad(V.getNode());
9685 /// \brief Try to lower insertion of a single element into a zero vector.
9687 /// This is a common pattern that we have especially efficient patterns to lower
9688 /// across all subtarget feature sets.
9689 static SDValue lowerVectorShuffleAsElementInsertion(
9690 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9691 const APInt &Zeroable, const X86Subtarget &Subtarget,
9692 SelectionDAG &DAG) {
9694 MVT EltVT = VT.getVectorElementType();
9697 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9699 bool IsV1Zeroable = true;
9700 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9701 if (i != V2Index && !Zeroable[i]) {
9702 IsV1Zeroable = false;
9706 // Check for a single input from a SCALAR_TO_VECTOR node.
9707 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9708 // all the smarts here sunk into that routine. However, the current
9709 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9710 // vector shuffle lowering is dead.
9711 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9713 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9714 // We need to zext the scalar if it is smaller than an i32.
9715 V2S = DAG.getBitcast(EltVT, V2S);
9716 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9717 // Using zext to expand a narrow element won't work for non-zero
9722 // Zero-extend directly to i32.
9724 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9726 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9727 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9728 EltVT == MVT::i16) {
9729 // Either not inserting from the low element of the input or the input
9730 // element size is too small to use VZEXT_MOVL to clear the high bits.
9734 if (!IsV1Zeroable) {
9735 // If V1 can't be treated as a zero vector we have fewer options to lower
9736 // this. We can't support integer vectors or non-zero targets cheaply, and
9737 // the V1 elements can't be permuted in any way.
9738 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9739 if (!VT.isFloatingPoint() || V2Index != 0)
9741 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9742 V1Mask[V2Index] = -1;
9743 if (!isNoopShuffleMask(V1Mask))
9745 // This is essentially a special case blend operation, but if we have
9746 // general purpose blend operations, they are always faster. Bail and let
9747 // the rest of the lowering handle these as blends.
9748 if (Subtarget.hasSSE41())
9751 // Otherwise, use MOVSD or MOVSS.
9752 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9753 "Only two types of floating point element types to handle!");
9754 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9758 // This lowering only works for the low element with floating point vectors.
9759 if (VT.isFloatingPoint() && V2Index != 0)
9762 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9764 V2 = DAG.getBitcast(VT, V2);
9767 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9768 // the desired position. Otherwise it is more efficient to do a vector
9769 // shift left. We know that we can do a vector shift left because all
9770 // the inputs are zero.
9771 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9772 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9773 V2Shuffle[V2Index] = 0;
9774 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9776 V2 = DAG.getBitcast(MVT::v16i8, V2);
9778 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9779 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9780 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9781 DAG.getDataLayout(), VT)));
9782 V2 = DAG.getBitcast(VT, V2);
9788 /// Try to lower broadcast of a single - truncated - integer element,
9789 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9791 /// This assumes we have AVX2.
9792 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9793 SDValue V0, int BroadcastIdx,
9794 const X86Subtarget &Subtarget,
9795 SelectionDAG &DAG) {
9796 assert(Subtarget.hasAVX2() &&
9797 "We can only lower integer broadcasts with AVX2!");
9799 EVT EltVT = VT.getVectorElementType();
9800 EVT V0VT = V0.getValueType();
9802 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9803 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9805 EVT V0EltVT = V0VT.getVectorElementType();
9806 if (!V0EltVT.isInteger())
9809 const unsigned EltSize = EltVT.getSizeInBits();
9810 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9812 // This is only a truncation if the original element type is larger.
9813 if (V0EltSize <= EltSize)
9816 assert(((V0EltSize % EltSize) == 0) &&
9817 "Scalar type sizes must all be powers of 2 on x86!");
9819 const unsigned V0Opc = V0.getOpcode();
9820 const unsigned Scale = V0EltSize / EltSize;
9821 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9823 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9824 V0Opc != ISD::BUILD_VECTOR)
9827 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9829 // If we're extracting non-least-significant bits, shift so we can truncate.
9830 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9831 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9832 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9833 if (const int OffsetIdx = BroadcastIdx % Scale)
9834 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9835 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9837 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9838 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9841 /// \brief Try to lower broadcast of a single element.
9843 /// For convenience, this code also bundles all of the subtarget feature set
9844 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9845 /// a convenient way to factor it out.
9846 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
9847 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9848 SDValue V1, SDValue V2,
9850 const X86Subtarget &Subtarget,
9851 SelectionDAG &DAG) {
9852 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9853 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9854 (Subtarget.hasAVX2() && VT.isInteger())))
9857 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9858 // we can only broadcast from a register with AVX2.
9859 unsigned NumElts = Mask.size();
9860 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9861 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9863 // Check that the mask is a broadcast.
9864 int BroadcastIdx = -1;
9865 for (int i = 0; i != (int)NumElts; ++i) {
9866 SmallVector<int, 8> BroadcastMask(NumElts, i);
9867 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9873 if (BroadcastIdx < 0)
9875 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9876 "a sorted mask where the broadcast "
9879 // Go up the chain of (vector) values to find a scalar load that we can
9880 // combine with the broadcast.
9883 switch (V.getOpcode()) {
9884 case ISD::BITCAST: {
9885 SDValue VSrc = V.getOperand(0);
9886 MVT SrcVT = VSrc.getSimpleValueType();
9887 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9892 case ISD::CONCAT_VECTORS: {
9893 int OperandSize = Mask.size() / V.getNumOperands();
9894 V = V.getOperand(BroadcastIdx / OperandSize);
9895 BroadcastIdx %= OperandSize;
9898 case ISD::INSERT_SUBVECTOR: {
9899 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9900 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9904 int BeginIdx = (int)ConstantIdx->getZExtValue();
9906 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9907 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9908 BroadcastIdx -= BeginIdx;
9919 // Check if this is a broadcast of a scalar. We special case lowering
9920 // for scalars so that we can more effectively fold with loads.
9921 // First, look through bitcast: if the original value has a larger element
9922 // type than the shuffle, the broadcast element is in essence truncated.
9923 // Make that explicit to ease folding.
9924 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9925 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9926 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9927 return TruncBroadcast;
9929 MVT BroadcastVT = VT;
9931 // Peek through any bitcast (only useful for loads).
9932 SDValue BC = peekThroughBitcasts(V);
9934 // Also check the simpler case, where we can directly reuse the scalar.
9935 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9936 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9937 V = V.getOperand(BroadcastIdx);
9939 // If we can't broadcast from a register, check that the input is a load.
9940 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9942 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9943 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9944 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9945 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9946 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9949 // If we are broadcasting a load that is only used by the shuffle
9950 // then we can reduce the vector load to the broadcasted scalar load.
9951 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9952 SDValue BaseAddr = Ld->getOperand(1);
9953 EVT SVT = BroadcastVT.getScalarType();
9954 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9955 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9956 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9957 DAG.getMachineFunction().getMachineMemOperand(
9958 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9960 // Make sure the newly-created LOAD is in the same position as Ld in
9961 // terms of dependency. We create a TokenFactor for Ld and V,
9962 // and update uses of Ld's output chain to use the TokenFactor.
9963 if (Ld->hasAnyUseOfValue(1)) {
9964 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9965 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9966 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9967 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9968 SDValue(V.getNode(), 1));
9970 } else if (!BroadcastFromReg) {
9971 // We can't broadcast from a vector register.
9973 } else if (BroadcastIdx != 0) {
9974 // We can only broadcast from the zero-element of a vector register,
9975 // but it can be advantageous to broadcast from the zero-element of a
9977 if (!VT.is256BitVector() && !VT.is512BitVector())
9980 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9981 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9984 // Only broadcast the zero-element of a 128-bit subvector.
9985 unsigned EltSize = VT.getScalarSizeInBits();
9986 if (((BroadcastIdx * EltSize) % 128) != 0)
9989 // The shuffle input might have been a bitcast we looked through; look at
9990 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
9991 // later bitcast it to BroadcastVT.
9992 MVT SrcVT = V.getSimpleValueType();
9993 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9994 "Unexpected vector element size");
9995 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
9996 "Unexpected vector size");
9998 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
9999 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10000 DAG.getIntPtrConstant(BroadcastIdx, DL));
10003 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10004 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10005 DAG.getBitcast(MVT::f64, V));
10007 // Bitcast back to the same scalar type as BroadcastVT.
10008 MVT SrcVT = V.getSimpleValueType();
10009 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10010 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10011 "Unexpected vector element size");
10012 if (SrcVT.isVector()) {
10013 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10014 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10016 SrcVT = BroadcastVT.getScalarType();
10018 V = DAG.getBitcast(SrcVT, V);
10021 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10022 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10023 V = DAG.getBitcast(MVT::f64, V);
10024 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10025 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10028 // We only support broadcasting from 128-bit vectors to minimize the
10029 // number of patterns we need to deal with in isel. So extract down to
10031 if (SrcVT.getSizeInBits() > 128)
10032 V = extract128BitVector(V, 0, DAG, DL);
10034 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10037 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10038 // INSERTPS when the V1 elements are already in the correct locations
10039 // because otherwise we can just always use two SHUFPS instructions which
10040 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10041 // perform INSERTPS if a single V1 element is out of place and all V2
10042 // elements are zeroable.
10043 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10044 unsigned &InsertPSMask,
10045 const APInt &Zeroable,
10046 ArrayRef<int> Mask,
10047 SelectionDAG &DAG) {
10048 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10049 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10050 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10052 // Attempt to match INSERTPS with one element from VA or VB being
10053 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10055 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10056 ArrayRef<int> CandidateMask) {
10057 unsigned ZMask = 0;
10058 int VADstIndex = -1;
10059 int VBDstIndex = -1;
10060 bool VAUsedInPlace = false;
10062 for (int i = 0; i < 4; ++i) {
10063 // Synthesize a zero mask from the zeroable elements (includes undefs).
10069 // Flag if we use any VA inputs in place.
10070 if (i == CandidateMask[i]) {
10071 VAUsedInPlace = true;
10075 // We can only insert a single non-zeroable element.
10076 if (VADstIndex >= 0 || VBDstIndex >= 0)
10079 if (CandidateMask[i] < 4) {
10080 // VA input out of place for insertion.
10083 // VB input for insertion.
10088 // Don't bother if we have no (non-zeroable) element for insertion.
10089 if (VADstIndex < 0 && VBDstIndex < 0)
10092 // Determine element insertion src/dst indices. The src index is from the
10093 // start of the inserted vector, not the start of the concatenated vector.
10094 unsigned VBSrcIndex = 0;
10095 if (VADstIndex >= 0) {
10096 // If we have a VA input out of place, we use VA as the V2 element
10097 // insertion and don't use the original V2 at all.
10098 VBSrcIndex = CandidateMask[VADstIndex];
10099 VBDstIndex = VADstIndex;
10102 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10105 // If no V1 inputs are used in place, then the result is created only from
10106 // the zero mask and the V2 insertion - so remove V1 dependency.
10107 if (!VAUsedInPlace)
10108 VA = DAG.getUNDEF(MVT::v4f32);
10110 // Update V1, V2 and InsertPSMask accordingly.
10114 // Insert the V2 element into the desired position.
10115 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10120 if (matchAsInsertPS(V1, V2, Mask))
10123 // Commute and try again.
10124 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10125 ShuffleVectorSDNode::commuteMask(CommutedMask);
10126 if (matchAsInsertPS(V2, V1, CommutedMask))
10132 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10133 SDValue V2, ArrayRef<int> Mask,
10134 const APInt &Zeroable,
10135 SelectionDAG &DAG) {
10136 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10137 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10139 // Attempt to match the insertps pattern.
10140 unsigned InsertPSMask;
10141 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10144 // Insert the V2 element into the desired position.
10145 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10146 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10149 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10150 /// UNPCK instruction.
10152 /// This specifically targets cases where we end up with alternating between
10153 /// the two inputs, and so can permute them into something that feeds a single
10154 /// UNPCK instruction. Note that this routine only targets integer vectors
10155 /// because for floating point vectors we have a generalized SHUFPS lowering
10156 /// strategy that handles everything that doesn't *exactly* match an unpack,
10157 /// making this clever lowering unnecessary.
10158 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10159 SDValue V1, SDValue V2,
10160 ArrayRef<int> Mask,
10161 SelectionDAG &DAG) {
10162 assert(!VT.isFloatingPoint() &&
10163 "This routine only supports integer vectors.");
10164 assert(VT.is128BitVector() &&
10165 "This routine only works on 128-bit vectors.");
10166 assert(!V2.isUndef() &&
10167 "This routine should only be used when blending two inputs.");
10168 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10170 int Size = Mask.size();
10173 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10175 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10177 bool UnpackLo = NumLoInputs >= NumHiInputs;
10179 auto TryUnpack = [&](int ScalarSize, int Scale) {
10180 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10181 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10183 for (int i = 0; i < Size; ++i) {
10187 // Each element of the unpack contains Scale elements from this mask.
10188 int UnpackIdx = i / Scale;
10190 // We only handle the case where V1 feeds the first slots of the unpack.
10191 // We rely on canonicalization to ensure this is the case.
10192 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10195 // Setup the mask for this input. The indexing is tricky as we have to
10196 // handle the unpack stride.
10197 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10198 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10202 // If we will have to shuffle both inputs to use the unpack, check whether
10203 // we can just unpack first and shuffle the result. If so, skip this unpack.
10204 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10205 !isNoopShuffleMask(V2Mask))
10208 // Shuffle the inputs into place.
10209 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10210 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10212 // Cast the inputs to the type we will use to unpack them.
10213 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10214 V1 = DAG.getBitcast(UnpackVT, V1);
10215 V2 = DAG.getBitcast(UnpackVT, V2);
10217 // Unpack the inputs and cast the result back to the desired type.
10218 return DAG.getBitcast(
10219 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10220 UnpackVT, V1, V2));
10223 // We try each unpack from the largest to the smallest to try and find one
10224 // that fits this mask.
10225 int OrigScalarSize = VT.getScalarSizeInBits();
10226 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10227 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10230 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10232 if (NumLoInputs == 0 || NumHiInputs == 0) {
10233 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10234 "We have to have *some* inputs!");
10235 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10237 // FIXME: We could consider the total complexity of the permute of each
10238 // possible unpacking. Or at the least we should consider how many
10239 // half-crossings are created.
10240 // FIXME: We could consider commuting the unpacks.
10242 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10243 for (int i = 0; i < Size; ++i) {
10247 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10250 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10252 return DAG.getVectorShuffle(
10253 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10255 DAG.getUNDEF(VT), PermMask);
10261 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10263 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10264 /// support for floating point shuffles but not integer shuffles. These
10265 /// instructions will incur a domain crossing penalty on some chips though so
10266 /// it is better to avoid lowering through this for integer vectors where
10268 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10269 const APInt &Zeroable,
10270 SDValue V1, SDValue V2,
10271 const X86Subtarget &Subtarget,
10272 SelectionDAG &DAG) {
10273 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10274 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10275 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10277 if (V2.isUndef()) {
10278 // Check for being able to broadcast a single element.
10279 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10280 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10283 // Straight shuffle of a single input vector. Simulate this by using the
10284 // single input as both of the "inputs" to this instruction..
10285 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10287 if (Subtarget.hasAVX()) {
10288 // If we have AVX, we can use VPERMILPS which will allow folding a load
10289 // into the shuffle.
10290 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10291 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10294 return DAG.getNode(
10295 X86ISD::SHUFP, DL, MVT::v2f64,
10296 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10297 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10298 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10300 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10301 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10303 // If we have a single input, insert that into V1 if we can do so cheaply.
10304 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10305 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10306 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10308 // Try inverting the insertion since for v2 masks it is easy to do and we
10309 // can't reliably sort the mask one way or the other.
10310 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10311 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10312 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10313 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10317 // Try to use one of the special instruction patterns to handle two common
10318 // blend patterns if a zero-blend above didn't work.
10319 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10320 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10321 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10322 // We can either use a special instruction to load over the low double or
10323 // to move just the low double.
10324 return DAG.getNode(
10325 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10326 DL, MVT::v2f64, V2,
10327 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10329 if (Subtarget.hasSSE41())
10330 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10331 Zeroable, Subtarget, DAG))
10334 // Use dedicated unpack instructions for masks that match their pattern.
10336 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10339 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10340 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10341 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10344 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10346 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10347 /// the integer unit to minimize domain crossing penalties. However, for blends
10348 /// it falls back to the floating point shuffle operation with appropriate bit
10350 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10351 const APInt &Zeroable,
10352 SDValue V1, SDValue V2,
10353 const X86Subtarget &Subtarget,
10354 SelectionDAG &DAG) {
10355 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10356 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10357 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10359 if (V2.isUndef()) {
10360 // Check for being able to broadcast a single element.
10361 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10362 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10365 // Straight shuffle of a single input vector. For everything from SSE2
10366 // onward this has a single fast instruction with no scary immediates.
10367 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10368 V1 = DAG.getBitcast(MVT::v4i32, V1);
10369 int WidenedMask[4] = {
10370 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10371 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10372 return DAG.getBitcast(
10374 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10375 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10377 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10378 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10379 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10380 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10382 // If we have a blend of two same-type PACKUS operations and the blend aligns
10383 // with the low and high halves, we can just merge the PACKUS operations.
10384 // This is particularly important as it lets us merge shuffles that this
10385 // routine itself creates.
10386 auto GetPackNode = [](SDValue V) {
10387 V = peekThroughBitcasts(V);
10388 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10390 if (SDValue V1Pack = GetPackNode(V1))
10391 if (SDValue V2Pack = GetPackNode(V2)) {
10392 EVT PackVT = V1Pack.getValueType();
10393 if (PackVT == V2Pack.getValueType())
10394 return DAG.getBitcast(MVT::v2i64,
10395 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10396 Mask[0] == 0 ? V1Pack.getOperand(0)
10397 : V1Pack.getOperand(1),
10398 Mask[1] == 2 ? V2Pack.getOperand(0)
10399 : V2Pack.getOperand(1)));
10402 // Try to use shift instructions.
10403 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10404 Zeroable, Subtarget, DAG))
10407 // When loading a scalar and then shuffling it into a vector we can often do
10408 // the insertion cheaply.
10409 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10410 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10412 // Try inverting the insertion since for v2 masks it is easy to do and we
10413 // can't reliably sort the mask one way or the other.
10414 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10415 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10416 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10419 // We have different paths for blend lowering, but they all must use the
10420 // *exact* same predicate.
10421 bool IsBlendSupported = Subtarget.hasSSE41();
10422 if (IsBlendSupported)
10423 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10424 Zeroable, Subtarget, DAG))
10427 // Use dedicated unpack instructions for masks that match their pattern.
10429 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10432 // Try to use byte rotation instructions.
10433 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10434 if (Subtarget.hasSSSE3())
10435 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10436 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10439 // If we have direct support for blends, we should lower by decomposing into
10440 // a permute. That will be faster than the domain cross.
10441 if (IsBlendSupported)
10442 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10445 // We implement this with SHUFPD which is pretty lame because it will likely
10446 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10447 // However, all the alternatives are still more cycles and newer chips don't
10448 // have this problem. It would be really nice if x86 had better shuffles here.
10449 V1 = DAG.getBitcast(MVT::v2f64, V1);
10450 V2 = DAG.getBitcast(MVT::v2f64, V2);
10451 return DAG.getBitcast(MVT::v2i64,
10452 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10455 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10457 /// This is used to disable more specialized lowerings when the shufps lowering
10458 /// will happen to be efficient.
10459 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10460 // This routine only handles 128-bit shufps.
10461 assert(Mask.size() == 4 && "Unsupported mask size!");
10462 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10463 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10464 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10465 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10467 // To lower with a single SHUFPS we need to have the low half and high half
10468 // each requiring a single input.
10469 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10471 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10477 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10479 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10480 /// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it. Mask elements in [0, 4) select from V1 and elements in [4, 8)
/// select from V2 (per the count_if/find_if predicates below).
10482 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10483 ArrayRef<int> Mask, SDValue V1,
10484 SDValue V2, SelectionDAG &DAG) {
// LowV/HighV track which operand ends up feeding the low and high halves of
// the final SHUFP; they may be swapped below.
10485 SDValue LowV = V1, HighV = V2;
// Mutable copy of the incoming 4-element mask, rewritten as intermediate
// shuffles are emitted.
10486 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10488 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10490 if (NumV2Elements == 1) {
// Index of the single mask element that reads from V2.
10491 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10493 // Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
10495 int V2AdjIndex = V2Index ^ 1;
10497 if (Mask[V2AdjIndex] < 0) {
10498 // Handles all the cases where we have a single V2 element and an undef.
10499 // This will only ever happen in the high lanes because we commute the
10500 // vector otherwise.
10502 std::swap(LowV, HighV);
10503 NewMask[V2Index] -= 4;
10505 // Handle the case where the V2 element ends up adjacent to a V1 element.
10506 // To make this work, blend them together as the first step.
10507 int V1Index = V2AdjIndex;
// Pack the V2 element into lane 0 and the V1 element into lane 2 of a
// temporary formed from (V2, V1).
10508 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10509 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10510 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10512 // Now proceed to reconstruct the final blend as we have the necessary
10513 // high or low half formed.
10520 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10521 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10523 } else if (NumV2Elements == 2) {
10524 if (Mask[0] < 4 && Mask[1] < 4) {
10525 // Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
10529 } else if (Mask[2] < 4 && Mask[3] < 4) {
10530 // We also handle the reversed case because this utility may get called
10531 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10532 // arrange things in the right direction.
10538 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10539 // trying to place elements directly, just blend them and set up the final
10540 // shuffle to place them.
10542 // The first two blend mask elements are for V1, the second two are for
// V2.
10544 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10545 Mask[2] < 4 ? Mask[2] : Mask[3],
10546 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10547 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10548 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10549 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10551 // Now we do a normal shuffle of V1 by giving V1 as both operands to
// the shuffle.
10554 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10555 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10556 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10557 NewMask[3] = Mask[2] < 4 ? 3 : 1;
// Emit the final SHUFP combining the (possibly rewritten) halves.
10560 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10561 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10564 /// \brief Lower 4-lane 32-bit floating point shuffles.
10566 /// Uses instructions exclusively from the floating point unit to minimize
10567 /// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
10569 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10570 const APInt &Zeroable,
10571 SDValue V1, SDValue V2,
10572 const X86Subtarget &Subtarget,
10573 SelectionDAG &DAG) {
10574 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10575 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10576 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Count how many mask elements read from V2 to pick a lowering strategy.
10578 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10580 if (NumV2Elements == 0) {
10581 // Check for being able to broadcast a single element.
10582 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10583 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10586 // Use even/odd duplicate instructions for masks that match their pattern.
10587 if (Subtarget.hasSSE3()) {
10588 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10589 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10590 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10591 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10594 if (Subtarget.hasAVX()) {
10595 // If we have AVX, we can use VPERMILPS which will allow folding a load
10596 // into the shuffle.
10597 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10601 // Otherwise, use a straight shuffle of a single input vector. We pass the
10602 // input vector to both operands to simulate this with a SHUFPS.
10603 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10604 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10607 // There are special ways we can lower some single-element blends. However, we
10608 // have custom ways we can lower more complex single-element blends below that
10609 // we defer to if both this and BLENDPS fail to match, so restrict this to
10610 // when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
10612 if (NumV2Elements == 1 && Mask[0] >= 4)
10613 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10614 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10617 if (Subtarget.hasSSE41()) {
10618 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10619 Zeroable, Subtarget, DAG))
10622 // Use INSERTPS if we can complete the shuffle efficiently.
10624 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
// If this mask can't be done as a single SHUFPS, try a blend+permute
// decomposition before falling through to the generic paths below.
10627 if (!isSingleSHUFPSMask(Mask))
10628 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10629 DL, MVT::v4f32, V1, V2, Mask, DAG))
10633 // Use low/high mov instructions.
10634 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10635 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10636 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10637 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10639 // Use dedicated unpack instructions for masks that match their pattern.
10641 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10644 // Otherwise fall back to a SHUFPS lowering strategy.
10645 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10648 /// \brief Lower 4-lane i32 vector shuffles.
10650 /// We try to handle these with integer-domain shuffles where we can, but for
10651 /// blends we use the floating point domain blend instructions.
10652 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10653 const APInt &Zeroable,
10654 SDValue V1, SDValue V2,
10655 const X86Subtarget &Subtarget,
10656 SelectionDAG &DAG) {
10657 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10658 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10659 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10661 // Whenever we can lower this as a zext, that instruction is strictly faster
10662 // than any alternative. It also allows us to fold memory operands into the
10663 // shuffle in many cases.
10664 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10665 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
// Count how many mask elements read from V2 to pick a lowering strategy.
10668 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10670 if (NumV2Elements == 0) {
10671 // Check for being able to broadcast a single element.
10672 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10673 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10676 // Straight shuffle of a single input vector. For everything from SSE2
10677 // onward this has a single fast instruction with no scary immediates.
10678 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10679 // but we aren't actually going to use the UNPCK instruction because doing
10680 // so prevents folding a load into this instruction or making a copy.
10681 const int UnpackLoMask[] = {0, 0, 1, 1};
10682 const int UnpackHiMask[] = {2, 2, 3, 3};
10683 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10684 Mask = UnpackLoMask;
10685 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10686 Mask = UnpackHiMask;
10688 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10689 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10692 // Try to use shift instructions.
10693 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10694 Zeroable, Subtarget, DAG))
10697 // There are special ways we can lower some single-element blends.
10698 if (NumV2Elements == 1)
10699 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10700 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10703 // We have different paths for blend lowering, but they all must use the
10704 // *exact* same predicate.
10705 bool IsBlendSupported = Subtarget.hasSSE41();
10706 if (IsBlendSupported)
10707 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10708 Zeroable, Subtarget, DAG))
// Try a bit-mask based lowering.
10711 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10715 // Use dedicated unpack instructions for masks that match their pattern.
10717 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10720 // Try to use byte rotation instructions.
10721 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10722 if (Subtarget.hasSSSE3())
10723 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10724 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10727 // Assume that a single SHUFPS is faster than an alternative sequence of
10728 // multiple instructions (even if the CPU has a domain penalty).
10729 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10730 if (!isSingleSHUFPSMask(Mask)) {
10731 // If we have direct support for blends, we should lower by decomposing into
10732 // a permute. That will be faster than the domain cross.
10733 if (IsBlendSupported)
10734 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10737 // Try to lower by permuting the inputs into an unpack instruction.
10738 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10739 DL, MVT::v4i32, V1, V2, Mask, DAG))
10743 // We implement this with SHUFPS because it can blend from two vectors.
10744 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10745 // up the inputs, bypassing domain shift penalties that we would incur if we
10746 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// an issue.
10748 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10749 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10750 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10751 return DAG.getBitcast(MVT::v4i32, ShufPS);
10754 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10755 /// shuffle lowering, and the most complex part.
10757 /// The lowering strategy is to try to form pairs of input lanes which are
10758 /// targeted at the same half of the final vector, and then use a dword shuffle
10759 /// to place them onto the right half, and finally unpack the paired lanes into
10760 /// their final position.
10762 /// The exact breakdown of how to form these dword pairs and align them on the
10763 /// correct sides is really tricky. See the comments within the function for
10764 /// more of the details.
10766 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10767 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10768 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10769 /// vector, form the analogous 128-bit 8-element Mask.
///
/// Note: \p Mask is mutable and is rewritten in place as intermediate
/// shuffles are emitted (including across the recursive call below).
10770 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10771 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10772 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10773 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
// The i32 vector type with the same total width as VT, used for the PSHUFD
// (dword shuffle) steps.
10774 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10776 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
// Mutable views of the mask for the low (0-3) and high (4-7) destination
// halves.
10777 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10778 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
// Collect the defined (non-negative) inputs feeding each destination half,
// sorted and deduplicated.
10780 SmallVector<int, 4> LoInputs;
10781 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10782 std::sort(LoInputs.begin(), LoInputs.end());
10783 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10784 SmallVector<int, 4> HiInputs;
10785 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10786 std::sort(HiInputs.begin(), HiInputs.end());
10787 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
// Partition each sorted input list at 4: values < 4 originate in the
// source's low half, values >= 4 in its high half.
10789 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10790 int NumHToL = LoInputs.size() - NumLToL;
10792 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10793 int NumHToH = HiInputs.size() - NumLToH;
// Non-owning views over the four (source half -> dest half) input groups.
10794 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10795 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10796 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10797 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10799 // If we are splatting two values from one half - one to each half, then
10800 // we can shuffle that half so each is splatted to a dword, then splat those
10801 // to their respective halves.
10802 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10804 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10805 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10806 V = DAG.getNode(ShufWOp, DL, VT, V,
10807 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10808 V = DAG.getBitcast(PSHUFDVT, V);
10809 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10810 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10811 return DAG.getBitcast(VT, V);
10814 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10815 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10816 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10817 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10819 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10820 // such inputs we can swap two of the dwords across the half mark and end up
10821 // with <=2 inputs to each half in each half. Once there, we can fall through
10822 // to the generic code below. For example:
10824 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10825 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10827 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10828 // and an existing 2-into-2 on the other half. In this case we may have to
10829 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10830 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10831 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10832 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10833 // half than the one we target for fixing) will be fixed when we re-enter this
10834 // path. We will also combine away any sequence of PSHUFD instructions that
10835 // result into a single instruction. Here is an example of the tricky case:
10837 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10838 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10840 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10842 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10843 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10845 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10846 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10848 // The result is fine to be handled by the generic logic.
10849 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10850 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10851 int AOffset, int BOffset) {
10852 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10853 "Must call this with A having 3 or 1 inputs from the A half.");
10854 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10855 "Must call this with B having 1 or 3 inputs from the B half.");
10856 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10857 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10859 bool ThreeAInputs = AToAInputs.size() == 3;
10861 // Compute the index of dword with only one word among the three inputs in
10862 // a half by taking the sum of the half with three inputs and subtracting
10863 // the sum of the actual three inputs. The difference is the remaining
// word (the one not used by the three inputs).
10865 int ADWord, BDWord;
10866 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10867 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10868 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10869 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10870 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10871 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10872 int TripleNonInputIdx =
10873 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10874 TripleDWord = TripleNonInputIdx / 2;
10876 // We use xor with one to compute the adjacent DWord to whichever one the
// single input lands in.
10878 OneInputDWord = (OneInput / 2) ^ 1;
10880 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10881 // and BToA inputs. If there is also such a problem with the BToB and AToB
10882 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10883 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10884 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10885 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10886 // Compute how many inputs will be flipped by swapping these DWords. We
10888 // to balance this to ensure we don't form a 3-1 shuffle in the other
// half's inputs.
10890 int NumFlippedAToBInputs =
10891 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10892 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10893 int NumFlippedBToBInputs =
10894 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10895 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10896 if ((NumFlippedAToBInputs == 1 &&
10897 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10898 (NumFlippedBToBInputs == 1 &&
10899 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10900 // We choose whether to fix the A half or B half based on whether that
10901 // half has zero flipped inputs. At zero, we may not be able to fix it
10902 // with that half. We also bias towards fixing the B half because that
10903 // will more commonly be the high half, and we have to bias one way.
10904 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10905 ArrayRef<int> Inputs) {
10906 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10907 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10908 // Determine whether the free index is in the flipped dword or the
10909 // unflipped dword based on where the pinned index is. We use this bit
10910 // in an xor to conditionally select the adjacent dword.
10911 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10912 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10913 if (IsFixIdxInput == IsFixFreeIdxInput)
10915 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10916 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10917 "We need to be changing the number of flipped inputs!");
// Swap the fix slot with the free slot inside the affected half.
10918 int PSHUFHalfMask[] = {0, 1, 2, 3};
10919 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10920 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10922 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
// Rewrite the outer mask to reflect the swap we just performed.
10924 for (int &M : Mask)
10925 if (M >= 0 && M == FixIdx)
10927 else if (M >= 0 && M == FixFreeIdx)
10930 if (NumFlippedBToBInputs != 0) {
10932 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10933 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10935 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10936 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10937 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
// Swap the A and B dwords with a single PSHUFD in the dword domain.
10942 int PSHUFDMask[] = {0, 1, 2, 3};
10943 PSHUFDMask[ADWord] = BDWord;
10944 PSHUFDMask[BDWord] = ADWord;
10945 V = DAG.getBitcast(
10947 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10948 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10950 // Adjust the mask to match the new locations of A and B.
10951 for (int &M : Mask)
10952 if (M >= 0 && M/2 == ADWord)
10953 M = 2 * BDWord + M % 2;
10954 else if (M >= 0 && M/2 == BDWord)
10955 M = 2 * ADWord + M % 2;
10957 // Recurse back into this routine to re-compute state now that this isn't
10958 // a 3 and 1 problem.
10959 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10962 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10963 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10964 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10965 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10967 // At this point there are at most two inputs to the low and high halves from
10968 // each half. That means the inputs can always be grouped into dwords and
10969 // those dwords can then be moved to the correct half with a dword shuffle.
10970 // We use at most one low and one high word shuffle to collect these paired
10971 // inputs into dwords, and finally a dword shuffle to place them.
10972 int PSHUFLMask[4] = {-1, -1, -1, -1};
10973 int PSHUFHMask[4] = {-1, -1, -1, -1};
10974 int PSHUFDMask[4] = {-1, -1, -1, -1};
10976 // First fix the masks for all the inputs that are staying in their
10977 // original halves. This will then dictate the targets of the cross-half
// inputs.
10979 auto fixInPlaceInputs =
10980 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10981 MutableArrayRef<int> SourceHalfMask,
10982 MutableArrayRef<int> HalfMask, int HalfOffset) {
10983 if (InPlaceInputs.empty())
10985 if (InPlaceInputs.size() == 1) {
10986 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10987 InPlaceInputs[0] - HalfOffset;
10988 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10991 if (IncomingInputs.empty()) {
10992 // Just fix all of the in place inputs.
10993 for (int Input : InPlaceInputs) {
10994 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10995 PSHUFDMask[Input / 2] = Input / 2;
11000 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11001 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11002 InPlaceInputs[0] - HalfOffset;
11003 // Put the second input next to the first so that they are packed into
11004 // a dword. We find the adjacent index by toggling the low bit.
11005 int AdjIndex = InPlaceInputs[0] ^ 1;
11006 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11007 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11008 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11010 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11011 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11013 // Now gather the cross-half inputs and place them into a free dword of
11014 // their target half.
11015 // FIXME: This operation could almost certainly be simplified dramatically to
11016 // look more like the 3-1 fixing operation.
11017 auto moveInputsToRightHalf = [&PSHUFDMask](
11018 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11019 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11020 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
// A word slot is "clobbered" when its half mask already routes a
// different word into it.
11022 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11023 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11025 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11027 int LowWord = Word & ~1;
11028 int HighWord = Word | 1;
11029 return isWordClobbered(SourceHalfMask, LowWord) ||
11030 isWordClobbered(SourceHalfMask, HighWord);
11033 if (IncomingInputs.empty())
11036 if (ExistingInputs.empty()) {
11037 // Map any dwords with inputs from them into the right half.
11038 for (int Input : IncomingInputs) {
11039 // If the source half mask maps over the inputs, turn those into
11040 // swaps and use the swapped lane.
11041 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11042 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11043 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11044 Input - SourceOffset;
11045 // We have to swap the uses in our half mask in one sweep.
11046 for (int &M : HalfMask)
11047 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11049 else if (M == Input)
11050 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11052 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11053 Input - SourceOffset &&
11054 "Previous placement doesn't match!");
11056 // Note that this correctly re-maps both when we do a swap and when
11057 // we observe the other side of the swap above. We rely on that to
11058 // avoid swapping the members of the input list directly.
11059 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11062 // Map the input's dword into the correct half.
11063 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11064 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11066 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11068 "Previous placement doesn't match!");
11071 // And just directly shift any other-half mask elements to be same-half
11072 // as we will have mirrored the dword containing the element into the
11073 // same position within that half.
11074 for (int &M : HalfMask)
11075 if (M >= SourceOffset && M < SourceOffset + 4) {
11076 M = M - SourceOffset + DestOffset;
11077 assert(M >= 0 && "This should never wrap below zero!");
11082 // Ensure we have the input in a viable dword of its current half. This
11083 // is particularly tricky because the original position may be clobbered
11084 // by inputs being moved and *staying* in that half.
11085 if (IncomingInputs.size() == 1) {
11086 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11087 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11089 SourceHalfMask[InputFixed - SourceOffset] =
11090 IncomingInputs[0] - SourceOffset;
11091 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11093 IncomingInputs[0] = InputFixed;
11095 } else if (IncomingInputs.size() == 2) {
11096 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11097 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11098 // We have two non-adjacent or clobbered inputs we need to extract from
11099 // the source half. To do this, we need to map them into some adjacent
11100 // dword slot in the source mask.
11101 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11102 IncomingInputs[1] - SourceOffset};
11104 // If there is a free slot in the source half mask adjacent to one of
11105 // the inputs, place the other input in it. We use (Index XOR 1) to
11106 // compute an adjacent index.
11107 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11108 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11109 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11110 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11111 InputsFixed[1] = InputsFixed[0] ^ 1;
11112 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11113 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11114 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11115 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11116 InputsFixed[0] = InputsFixed[1] ^ 1;
11117 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11118 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11119 // The two inputs are in the same DWord but it is clobbered and the
11120 // adjacent DWord isn't used at all. Move both inputs to the free
// adjacent dword.
11122 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11123 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11124 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11125 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11127 // The only way we hit this point is if there is no clobbering
11128 // (because there are no off-half inputs to this half) and there is no
11129 // free slot adjacent to one of the inputs. In this case, we have to
11130 // swap an input with a non-input.
11131 for (int i = 0; i < 4; ++i)
11132 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11133 "We can't handle any clobbers here!");
11134 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11135 "Cannot have adjacent inputs here!");
11137 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11138 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11140 // We also have to update the final source mask in this case because
11141 // it may need to undo the above swap.
11142 for (int &M : FinalSourceHalfMask)
11143 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11144 M = InputsFixed[1] + SourceOffset;
11145 else if (M == InputsFixed[1] + SourceOffset)
11146 M = (InputsFixed[0] ^ 1) + SourceOffset;
11148 InputsFixed[1] = InputsFixed[0] ^ 1;
11151 // Point everything at the fixed inputs.
11152 for (int &M : HalfMask)
11153 if (M == IncomingInputs[0])
11154 M = InputsFixed[0] + SourceOffset;
11155 else if (M == IncomingInputs[1])
11156 M = InputsFixed[1] + SourceOffset;
11158 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11159 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11162 llvm_unreachable("Unhandled input size!");
11165 // Now hoist the DWord down to the right half.
11166 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11167 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11168 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11169 for (int &M : HalfMask)
11170 for (int Input : IncomingInputs)
11172 M = FreeDWord * 2 + Input % 2;
11174 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11175 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11176 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11177 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11179 // Now enact all the shuffles we've computed to move the inputs into their
// target halves.
11181 if (!isNoopShuffleMask(PSHUFLMask))
11182 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11183 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG))
11184 if (!isNoopShuffleMask(PSHUFHMask))
11185 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11186 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11187 if (!isNoopShuffleMask(PSHUFDMask))
11188 V = DAG.getBitcast(
11190 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11191 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11193 // At this point, each half should contain all its inputs, and we can then
11194 // just shuffle them into their final position.
11195 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11196 "Failed to lift all the high half inputs to the low mask!");
11197 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11198 "Failed to lift all the low half inputs to the high mask!");
11200 // Do a half shuffle for the low mask.
11201 if (!isNoopShuffleMask(LoMask))
11202 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11203 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11205 // Do a half shuffle with the high mask after shifting its values down.
11206 for (int &M : HiMask)
11209 if (!isNoopShuffleMask(HiMask))
11210 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11211 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11216 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11217 /// blend if only one input is used.
///
/// On return, the \p V1InUse and \p V2InUse out-parameters report whether the
/// PSHUFB of V1 / V2 (respectively) contributed any non-zeroed byte to the
/// result (see the |= updates in the loop below).
11218 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11219 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11220 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
// Per-byte PSHUFB control vectors, one for each input.
11222 SDValue V1Mask[16];
11223 SDValue V2Mask[16];
// Scale maps the incoming element granularity onto the 16 bytes of a
// v16i8 PSHUFB (e.g. Scale == 2 for a v8i16 mask).
11227 int Size = Mask.size();
11228 int Scale = 16 / Size;
11229 for (int i = 0; i < 16; ++i) {
11230 if (Mask[i / Scale] < 0) {
// Undef lanes stay undef in both control vectors.
11231 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
// Setting bit 7 (0x80) of a PSHUFB control byte zeroes that result byte.
11233 const int ZeroMask = 0x80;
11234 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11236 int V2Idx = Mask[i / Scale] < Size
11238 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11239 if (Zeroable[i / Scale])
11240 V1Idx = V2Idx = ZeroMask;
11241 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11242 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
// Track whether each input actually supplies a live (non-zeroed) byte.
11243 V1InUse |= (ZeroMask != V1Idx);
11244 V2InUse |= (ZeroMask != V2Idx);
11249 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11250 DAG.getBitcast(MVT::v16i8, V1),
11251 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11253 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11254 DAG.getBitcast(MVT::v16i8, V2),
11255 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11257 // If we need shuffled inputs from both, blend the two.
// Disjoint zeroed lanes make a simple OR a correct blend here.
11259 if (V1InUse && V2InUse)
11260 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11262 V = V1InUse ? V1 : V2;
11264 // Cast the result back to the correct type.
11265 return DAG.getBitcast(VT, V);
11268 /// \brief Generic lowering of 8-lane i16 shuffles.
11270 /// This handles both single-input shuffles and combined shuffle/blends with
11271 /// two inputs. The single input shuffles are immediately delegated to
11272 /// a dedicated lowering routine.
11274 /// The blends are lowered in one of three fundamental ways. If there are few
11275 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11276 /// of the input is significantly cheaper when lowered as an interleaving of
11277 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11278 /// halves of the inputs separately (making them have relatively few inputs)
11279 /// and then concatenate them.
// NOTE(review): the extract elides lines; most `if (SDValue X = ...)` guards
// below had a `return X;` on a following (now invisible) line.
11280 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11281                                        const APInt &Zeroable,
11282                                        SDValue V1, SDValue V2,
11283                                        const X86Subtarget &Subtarget,
11284                                        SelectionDAG &DAG) {
11285   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11286   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11287   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11289   // Whenever we can lower this as a zext, that instruction is strictly faster
11290   // than any alternative.
11291   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11292           DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
// Mask indices >= 8 select elements from V2.
11295   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
// Single-input path: only V1 elements are referenced.
11297   if (NumV2Inputs == 0) {
11298     // Check for being able to broadcast a single element.
11299     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11300             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11303     // Try to use shift instructions.
11304     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11305                                                   Zeroable, Subtarget, DAG))
11308     // Use dedicated unpack instructions for masks that match their pattern.
11310             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11313     // Try to use byte rotation instructions.
11314     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11315                                                         Mask, Subtarget, DAG))
11318     // Make a copy of the mask so it can be modified.
11319     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11320     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11321                                                      MutableMask, Subtarget,
11325   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11326          "All single-input shuffles should be canonicalized to be V1-input "
11329   // Try to use shift instructions.
11330   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11331                                                 Zeroable, Subtarget, DAG))
11334   // See if we can use SSE4A Extraction / Insertion.
11335   if (Subtarget.hasSSE4A())
11336     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11340   // There are special ways we can lower some single-element blends.
11341   if (NumV2Inputs == 1)
11342     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11343             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11346   // We have different paths for blend lowering, but they all must use the
11347   // *exact* same predicate.
11348   bool IsBlendSupported = Subtarget.hasSSE41();
11349   if (IsBlendSupported)
11350     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11351                                                   Zeroable, Subtarget, DAG))
11354   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11358   // Use dedicated unpack instructions for masks that match their pattern.
11360           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11363   // Try to use byte rotation instructions.
11364   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11365           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11368   if (SDValue BitBlend =
11369           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11372   // Try to lower by permuting the inputs into an unpack instruction.
11373   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11377   // If we can't directly blend but can use PSHUFB, that will be better as it
11378   // can both shuffle and set up the inefficient blend.
11379   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11380     bool V1InUse, V2InUse;
11381     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11382                                               Zeroable, DAG, V1InUse, V2InUse);
11385   // We can always bit-blend if we have to so the fallback strategy is to
11386   // decompose into single-input permutes and blends.
11387   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11391 /// \brief Check whether a compaction lowering can be done by dropping even
11392 /// elements and compute how many times even elements must be dropped.
11394 /// This handles shuffles which take every Nth element where N is a power of
11395 /// two. Example shuffle masks:
11397 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11398 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11399 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11400 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11401 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11402 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11404 /// Any of these lanes can of course be undef.
11406 /// This routine only supports N <= 3.
11407 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11410 /// \returns N above, or the number of times even elements must be dropped if
11411 /// there is such a number. Otherwise returns zero.
11412 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11413                                           bool IsSingleInput) {
11414   // The modulus for the shuffle vector entries is based on whether this is
11415   // a single input or not.
11416   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11417   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11418          "We should only be called with masks with a power-of-2 size!");
// Power-of-two modulus lets us reduce mask values with a simple AND.
11420   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11422   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11423   // and 2^3 simultaneously. This is because we may have ambiguity with
11424   // partially undef inputs.
11425   bool ViableForN[3] = {true, true, true};
11427   for (int i = 0, e = Mask.size(); i < e; ++i) {
11428     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
// NOTE(review): the `if (Mask[i] < 0) continue;` line is elided here.
11433     bool IsAnyViable = false;
11434     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11435       if (ViableForN[j]) {
11436         uint64_t N = j + 1;
11438         // The shuffle mask must be equal to (i * 2^N) % M.
11439         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11440           IsAnyViable = true;
// A mismatch disqualifies stride 2^N (the `else` line is elided).
11442           ViableForN[j] = false;
11444     // Early exit if we exhaust the possible powers of two.
// Return the smallest viable N = j + 1 (the return falls on an elided line).
11449   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11453   // Return 0 as there is no viable power of two.
11457 /// \brief Generic lowering of v16i8 shuffles.
11459 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11460 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11461 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11462 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
// NOTE(review): the extract elides lines; most `if (SDValue X = ...)` guards
// below had a `return X;` on a following (now invisible) line.
11464 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11465                                        const APInt &Zeroable,
11466                                        SDValue V1, SDValue V2,
11467                                        const X86Subtarget &Subtarget,
11468                                        SelectionDAG &DAG) {
11469   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11470   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11471   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11473   // Try to use shift instructions.
11474   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11475                                                 Zeroable, Subtarget, DAG))
11478   // Try to use byte rotation instructions.
11479   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11480           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11483   // Try to use a zext lowering.
11484   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11485           DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11488   // See if we can use SSE4A Extraction / Insertion.
11489   if (Subtarget.hasSSE4A())
11490     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
// Mask indices >= 16 select elements from V2.
11494   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11496   // For single-input shuffles, there are some nicer lowering tricks we can use.
11497   if (NumV2Elements == 0) {
11498     // Check for being able to broadcast a single element.
11499     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11500             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11503     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11504     // Notably, this handles splat and partial-splat shuffles more efficiently.
11505     // However, it only makes sense if the pre-duplication shuffle simplifies
11506     // things significantly. Currently, this means we need to be able to
11507     // express the pre-duplication shuffle as an i16 shuffle.
11509     // FIXME: We should check for other patterns which can be widened into an
11510     // i16 shuffle as well.
// Widening is only possible when each adjacent byte pair requests the same
// source byte (or is undef), i.e. the mask is a duplicated-byte pattern.
11511     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11512       for (int i = 0; i < 16; i += 2)
11513         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11518     auto tryToWidenViaDuplication = [&]() -> SDValue {
11519       if (!canWidenViaDuplication(Mask))
// Collect the distinct referenced bytes from the low and high halves.
11521       SmallVector<int, 4> LoInputs;
11522       copy_if(Mask, std::back_inserter(LoInputs),
11523               [](int M) { return M >= 0 && M < 8; });
11524       std::sort(LoInputs.begin(), LoInputs.end());
11525       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11527       SmallVector<int, 4> HiInputs;
11528       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11529       std::sort(HiInputs.begin(), HiInputs.end());
11530       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
// Target whichever half already holds more of the inputs; the other half's
// inputs must be moved across by the pre-duplication i16 shuffle.
11533       bool TargetLo = LoInputs.size() >= HiInputs.size();
11534       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11535       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11537       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11538       SmallDenseMap<int, int, 8> LaneMap;
11539       for (int I : InPlaceInputs) {
11540         PreDupI16Shuffle[I/2] = I/2;
// Place each moving input into a free i16 slot of the target half.
11543       int j = TargetLo ? 0 : 4, je = j + 4;
11544       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11545         // Check if j is already a shuffle of this input. This happens when
11546         // there are two adjacent bytes after we move the low one.
11547         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11548           // If we haven't yet mapped the input, search for a slot into which
11550           while (j < je && PreDupI16Shuffle[j] >= 0)
11554             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11557           // Map this input with the i16 shuffle.
11558           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11561         // Update the lane map based on the mapping we ended up with.
11562         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11564       V1 = DAG.getBitcast(
11566           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11567                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11569       // Unpack the bytes to form the i16s that will be shuffled into place.
11570       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11571                        MVT::v16i8, V1, V1);
11573       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11574       for (int i = 0; i < 16; ++i)
11575         if (Mask[i] >= 0) {
11576           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11577           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11578           if (PostDupI16Shuffle[i / 2] < 0)
11579             PostDupI16Shuffle[i / 2] = MappedMask;
11581             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11582                    "Conflicting entries in the original shuffle!");
11584       return DAG.getBitcast(
11586           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11587                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11589     if (SDValue V = tryToWidenViaDuplication())
11593   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11597   // Use dedicated unpack instructions for masks that match their pattern.
11599           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11602   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11603   // with PSHUFB. It is important to do this before we attempt to generate any
11604   // blends but after all of the single-input lowerings. If the single input
11605   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11606   // want to preserve that and we can DAG combine any longer sequences into
11607   // a PSHUFB in the end. But once we start blending from multiple inputs,
11608   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11609   // and there are *very* few patterns that would actually be faster than the
11610   // PSHUFB approach because of its ability to zero lanes.
11612   // FIXME: The only exceptions to the above are blends which are exact
11613   // interleavings with direct instructions supporting them. We currently don't
11614   // handle those well here.
11615   if (Subtarget.hasSSSE3()) {
11616     bool V1InUse = false;
11617     bool V2InUse = false;
11619     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11620         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11622     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11623     // do so. This avoids using them to handle blends-with-zero which is
11624     // important as a single pshufb is significantly faster for that.
11625     if (V1InUse && V2InUse) {
11626       if (Subtarget.hasSSE41())
11627         if (SDValue Blend = lowerVectorShuffleAsBlend(
11628                 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11631       // We can use an unpack to do the blending rather than an or in some
11632       // cases. Even though the or may be (very minorly) more efficient, we
11633       // preference this lowering because there are common cases where part of
11634       // the complexity of the shuffles goes away when we do the final blend as
11636       // FIXME: It might be worth trying to detect if the unpack-feeding
11637       // shuffles will both be pshufb, in which case we shouldn't bother with
11639       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11640               DL, MVT::v16i8, V1, V2, Mask, DAG))
// The `return PSHUFB;` fallback falls on an elided line here.
11647   // There are special ways we can lower some single-element blends.
11648   if (NumV2Elements == 1)
11649     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11650             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11653   if (SDValue BitBlend =
11654           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11657   // Check whether a compaction lowering can be done. This handles shuffles
11658   // which take every Nth element for some even N. See the helper function for
11661   // We special case these as they can be particularly efficiently handled with
11662   // the PACKUSB instruction on x86 and they show up in common patterns of
11663   // rearranging bytes to truncate wide elements.
11664   bool IsSingleInput = V2.isUndef();
11665   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11666     // NumEvenDrops is the power of two stride of the elements. Another way of
11667     // thinking about it is that we need to drop the even elements this many
11668     // times to get the original input.
11670     // First we need to zero all the dropped bytes.
11671     assert(NumEvenDrops <= 3 &&
11672            "No support for dropping even elements more than 3 times.");
11673     // We use the mask type to pick which bytes are preserved based on how many
11674     // elements are dropped.
11675     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
// Splatting 0xFF at the wider element type keeps only the low byte of each
// NumEvenDrops-wide group once bitcast back to v16i8.
11676     SDValue ByteClearMask = DAG.getBitcast(
11677         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11678     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11679     if (!IsSingleInput)
11680       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11682     // Now pack things back together.
11683     V1 = DAG.getBitcast(MVT::v8i16, V1);
11684     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11685     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
// Each extra drop level needs one more PACKUS to halve the element spacing.
11686     for (int i = 1; i < NumEvenDrops; ++i) {
11687       Result = DAG.getBitcast(MVT::v8i16, Result);
11688       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11694   // Handle multi-input cases by blending single-input shuffles.
11695   if (NumV2Elements > 0)
11696     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11699   // The fallback path for single-input shuffles widens this into two v8i16
11700   // vectors with unpacks, shuffles those, and then pulls them back together
// Split the byte mask into lo/hi halves expressed as v8i16 blend masks.
11704   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11705   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11706   for (int i = 0; i < 16; ++i)
11708     (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11710   SDValue VLoHalf, VHiHalf;
11711   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11712   // them out and avoid using UNPCK{L,H} to extract the elements of V as
11714   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11715       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11716     // Use a mask to drop the high bytes.
11717     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11718     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11719                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
11721     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11722     VHiHalf = DAG.getUNDEF(MVT::v8i16);
11724     // Squash the masks to point directly into VLoHalf.
// The divide-by-2 adjustments to each mask element fall on elided lines.
11725     for (int &M : LoBlendMask)
11728     for (int &M : HiBlendMask)
11732     // Otherwise just unpack the low half of V into VLoHalf and the high half into
11733     // VHiHalf so that we can blend them as i16s.
11734     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11736     VLoHalf = DAG.getBitcast(
11737         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11738     VHiHalf = DAG.getBitcast(
11739         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11742   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11743   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11745   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11748 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11750 /// This routine breaks down the specific type of 128-bit shuffle and
11751 /// dispatches to the lowering routines accordingly.
11752 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11753                                         MVT VT, SDValue V1, SDValue V2,
11754                                         const APInt &Zeroable,
11755                                         const X86Subtarget &Subtarget,
11756                                         SelectionDAG &DAG) {
11757   switch (VT.SimpleTy) {
// NOTE(review): the `case MVT::v2i64:` etc. labels fall on elided lines in
// this extract; each call below handles one 128-bit element configuration.
11759     return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11761     return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11763     return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11765     return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11767     return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11769     return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
// Any other simple type reaching here is a programming error.
11772     llvm_unreachable("Unimplemented!");
11776 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11778 /// This routine just extracts two subvectors, shuffles them independently, and
11779 /// then concatenates them back together. This should work effectively with all
11780 /// AVX vector shuffle types.
11781 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11782                                           SDValue V2, ArrayRef<int> Mask,
11783                                           SelectionDAG &DAG) {
11784   assert(VT.getSizeInBits() >= 256 &&
11785          "Only for 256-bit or wider vector shuffles!");
11786   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11787   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
// Split the mask into the halves that will produce the lo/hi result vectors.
11789   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11790   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11792   int NumElements = VT.getVectorNumElements();
11793   int SplitNumElements = NumElements / 2;
11794   MVT ScalarVT = VT.getVectorElementType();
11795   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11797   // Rather than splitting build-vectors, just build two narrower build
11798   // vectors. This helps shuffling with splats and zeros.
// Returns the {lo, hi} halves of V as SplitVT values.
11799   auto SplitVector = [&](SDValue V) {
11800     V = peekThroughBitcasts(V);
11802     MVT OrigVT = V.getSimpleValueType();
11803     int OrigNumElements = OrigVT.getVectorNumElements();
11804     int OrigSplitNumElements = OrigNumElements / 2;
11805     MVT OrigScalarVT = OrigVT.getVectorElementType();
11806     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
// Non-build-vector inputs are split with plain subvector extracts.
11810     auto *BV = dyn_cast<BuildVectorSDNode>(V);
11812       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11813                         DAG.getIntPtrConstant(0, DL));
11814       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11815                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
// Build-vector inputs are rebuilt as two narrower build vectors instead.
11818       SmallVector<SDValue, 16> LoOps, HiOps;
11819       for (int i = 0; i < OrigSplitNumElements; ++i) {
11820         LoOps.push_back(BV->getOperand(i));
11821         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11823       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11824       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11826     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11827                           DAG.getBitcast(SplitVT, HiV));
11830   SDValue LoV1, HiV1, LoV2, HiV2;
11831   std::tie(LoV1, HiV1) = SplitVector(V1);
11832   std::tie(LoV2, HiV2) = SplitVector(V2);
11834   // Now create two 4-way blends of these half-width vectors.
11835   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11836     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11837     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11838     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11839     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
// Classify each mask element and build per-input shuffle masks plus the
// final blend mask; the `UseLo*/UseHi* = true` lines fall on elided lines.
11840     for (int i = 0; i < SplitNumElements; ++i) {
11841       int M = HalfMask[i];
11842       if (M >= NumElements) {
11843         if (M >= NumElements + SplitNumElements)
11847         V2BlendMask[i] = M - NumElements;
11848         BlendMask[i] = SplitNumElements + i;
11849       } else if (M >= 0) {
11850         if (M >= SplitNumElements)
11854         V1BlendMask[i] = M;
11859     // Because the lowering happens after all combining takes place, we need to
11860     // manually combine these blend masks as much as possible so that we create
11861     // a minimal number of high-level vector shuffle nodes.
11863     // First try just blending the halves of V1 or V2.
11864     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11865       return DAG.getUNDEF(SplitVT);
11866     if (!UseLoV2 && !UseHiV2)
11867       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11868     if (!UseLoV1 && !UseHiV1)
11869       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11871     SDValue V1Blend, V2Blend;
11872     if (UseLoV1 && UseHiV1) {
11874           DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11876       // We only use half of V1 so map the usage down into the final blend mask.
11877       V1Blend = UseLoV1 ? LoV1 : HiV1;
11878       for (int i = 0; i < SplitNumElements; ++i)
11879         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11880           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11882     if (UseLoV2 && UseHiV2) {
11884           DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11886       // We only use half of V2 so map the usage down into the final blend mask.
11887       V2Blend = UseLoV2 ? LoV2 : HiV2;
11888       for (int i = 0; i < SplitNumElements; ++i)
11889         if (BlendMask[i] >= SplitNumElements)
11890           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11892     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
// Lower each half independently, then concatenate back to the full width.
11894   SDValue Lo = HalfBlend(LoMask);
11895   SDValue Hi = HalfBlend(HiMask);
11896   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11899 /// \brief Either split a vector in halves or decompose the shuffles and the
11902 /// This is provided as a good fallback for many lowerings of non-single-input
11903 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11904 /// between splitting the shuffle into 128-bit components and stitching those
11905 /// back together vs. extracting the single-input shuffles and blending those
11907 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11908                                                 SDValue V1, SDValue V2,
11909                                                 ArrayRef<int> Mask,
11910                                                 SelectionDAG &DAG) {
11911   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11912          "shuffles as it could then recurse on itself.");
11913   int Size = Mask.size();
11915   // If this can be modeled as a broadcast of two elements followed by a blend,
11916   // prefer that lowering. This is especially important because broadcasts can
11917   // often fold with memory operands.
// True iff every used mask element of each input refers to a single index,
// i.e. both inputs are effectively splats.
11918   auto DoBothBroadcast = [&] {
11919     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
// NOTE(review): the loop header over Mask and the `M >= Size` test fall on
// elided lines in this extract.
11922         if (V2BroadcastIdx < 0)
11923           V2BroadcastIdx = M - Size;
11924         else if (M - Size != V2BroadcastIdx)
11926       } else if (M >= 0) {
11927         if (V1BroadcastIdx < 0)
11928           V1BroadcastIdx = M;
11929         else if (M != V1BroadcastIdx)
11934   if (DoBothBroadcast())
11935     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11938   // If the inputs all stem from a single 128-bit lane of each input, then we
11939   // split them rather than blending because the split will decompose to
11940   // unusually few instructions.
11941   int LaneCount = VT.getSizeInBits() / 128;
11942   int LaneSize = Size / LaneCount;
// LaneInputs[input][lane] records whether `lane` of `input` is referenced.
11943   SmallBitVector LaneInputs[2];
11944   LaneInputs[0].resize(LaneCount, false);
11945   LaneInputs[1].resize(LaneCount, false);
11946   for (int i = 0; i < Size; ++i)
11948       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11949   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11950     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11952   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11953   // that the decomposed single-input shuffles don't end up here.
11954   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11957 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11958 /// a permutation and blend of those lanes.
11960 /// This essentially blends the out-of-lane inputs to each lane into the lane
11961 /// from a permuted copy of the vector. This lowering strategy results in four
11962 /// instructions in the worst case for a single-input cross lane shuffle which
11963 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11964 /// of. Special cases for each particular shuffle pattern should be handled
11965 /// prior to trying this lowering.
11966 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11967                                                        SDValue V1, SDValue V2,
11968                                                        ArrayRef<int> Mask,
11969                                                        SelectionDAG &DAG) {
11970   // FIXME: This should probably be generalized for 512-bit vectors as well.
11971   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11972   int Size = Mask.size();
11973   int LaneSize = Size / 2;
11975   // If there are only inputs from one 128-bit lane, splitting will in fact be
11976   // less expensive. The flags track whether the given lane contains an element
11977   // that crosses to another lane.
11978   bool LaneCrossing[2] = {false, false};
11979   for (int i = 0; i < Size; ++i)
11980     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11981       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11982   if (!LaneCrossing[0] || !LaneCrossing[1])
11983     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11985   assert(V2.isUndef() &&
11986          "This last part of this routine only works on single input shuffles");
// Build a two-input mask: in-lane elements come from V1 directly; the
// cross-lane elements (offset by Size) come from the lane-flipped copy.
11988   SmallVector<int, 32> FlippedBlendMask(Size);
11989   for (int i = 0; i < Size; ++i)
11990     FlippedBlendMask[i] =
11991         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11993                                 : Mask[i] % LaneSize +
11994                                       (i / LaneSize) * LaneSize + Size);
11996   // Flip the vector, and blend the results which should now be in-lane. The
11997   // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11998   // 5 for the high source. The value 3 selects the high half of source 2 and
11999   // the value 2 selects the low half of source 2. We only use source 2 to
12000   // allow folding it into a memory operand.
12001   unsigned PERMMask = 3 | 2 << 4;
12002   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
12003                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
12004   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12007 /// \brief Handle lowering 2-lane 128-bit shuffles.
12008 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12009                                         SDValue V2, ArrayRef<int> Mask,
12010                                         const APInt &Zeroable,
12011                                         const X86Subtarget &Subtarget,
12012                                         SelectionDAG &DAG) {
// Only applicable when the mask can be widened to a 2-element (128-bit
// per element) mask; bail otherwise (the `return SDValue();` is elided).
12013   SmallVector<int, 4> WidenedMask;
12014   if (!canWidenShuffleElements(Mask, WidenedMask))
12017   // TODO: If minimizing size and one of the inputs is a zero vector and the
12018   // the zero vector has only one use, we could use a VPERM2X128 to save the
12019   // instruction bytes needed to explicitly generate the zero vector.
12021   // Blends are faster and handle all the non-lane-crossing cases.
12022   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12023                                                 Zeroable, Subtarget, DAG))
12026   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
12027   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
12029   // If either input operand is a zero vector, use VPERM2X128 because its mask
12030   // allows us to replace the zero input with an implicit zero.
12031   if (!IsV1Zero && !IsV2Zero) {
12032     // Check for patterns which can be matched with a single insert of a 128-bit
12034     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12035     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12036       // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
// NOTE(review): the early `return SDValue();` taken under this AVX2 check
// falls on an elided line here.
12037       if (Subtarget.hasAVX2() && V2.isUndef())
// Emit the insert as extract-low-halves + concat.
12040       MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12041                                    VT.getVectorNumElements() / 2);
12042       SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12043                                 DAG.getIntPtrConstant(0, DL));
12044       SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12045                                 OnlyUsesV1 ? V1 : V2,
12046                                 DAG.getIntPtrConstant(0, DL));
12047       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12051   // Otherwise form a 128-bit permutation. After accounting for undefs,
12052   // convert the 64-bit shuffle mask selection values into 128-bit
12053   // selection bits by dividing the indexes by 2 and shifting into positions
12054   // defined by a vperm2*128 instruction's immediate control byte.
12056   // The immediate permute control byte looks like this:
12057   //    [1:0] - select 128 bits from sources for low half of destination
12059   //    [3]   - zero low half of destination
12060   //    [5:4] - select 128 bits from sources for high half of destination
12062   //    [7]   - zero high half of destination
// Undef halves default to selecting lane 0 of V1.
12064   int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
12065   int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
12067   unsigned PermMask = MaskLO | (MaskHI << 4);
12069   // If either input is a zero vector, replace it with an undef input.
12070   // Shuffle mask values < 4 are selecting elements of V1.
12071   // Shuffle mask values >= 4 are selecting elements of V2.
12072   // Adjust each half of the permute mask by clearing the half that was
12073   // selecting the zero vector and setting the zero mask bit.
// The `if (IsV1Zero)` / mask-range checks guarding these fall on elided
// lines; bit 3 / bit 7 (0x08 / 0x80) are the implicit-zero controls.
12075     V1 = DAG.getUNDEF(VT);
12077       PermMask = (PermMask & 0xf0) | 0x08;
12079       PermMask = (PermMask & 0x0f) | 0x80;
12082     V2 = DAG.getUNDEF(VT);
12084       PermMask = (PermMask & 0xf0) | 0x08;
12086       PermMask = (PermMask & 0x0f) | 0x80;
12089   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12090                      DAG.getConstant(PermMask, DL, MVT::i8));
12093 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12094 /// shuffling each lane.
12096 /// This will only succeed when the result of fixing the 128-bit lanes results
12097 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12098 /// each 128-bit lanes. This handles many cases where we can quickly blend away
12099 /// the lane crosses early and then use simpler shuffles within each lane.
12101 /// FIXME: It might be worthwhile at some point to support this without
12102 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12103 /// in x86 only floating point has interesting non-repeating shuffles, and even
12104 /// those are still *marginally* more expensive.
12105 static SDValue lowerVectorShuffleByMerging128BitLanes(
12106 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12107 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
// Only meaningful with two distinct inputs; single-input shuffles are handled
// by cheaper lane-permute paths elsewhere.
12108 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12110 int Size = Mask.size();
// Number of mask elements per 128-bit lane (e.g. 2 for 64-bit scalars).
12111 int LaneSize = 128 / VT.getScalarSizeInBits();
12112 int NumLanes = Size / LaneSize;
12113 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12115 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12116 // check whether the in-128-bit lane shuffles share a repeating pattern.
// Lanes[j] records the single source lane feeding destination lane j (-1 =
// not yet seen); InLaneMask is the common per-lane element pattern.
12117 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12118 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12119 for (int i = 0; i < Size; ++i) {
12123 int j = i / LaneSize;
12125 if (Lanes[j] < 0) {
12126 // First entry we've seen for this lane.
12127 Lanes[j] = Mask[i] / LaneSize;
12128 } else if (Lanes[j] != Mask[i] / LaneSize) {
12129 // This doesn't match the lane selected previously!
12133 // Check that within each lane we have a consistent shuffle mask.
12134 int k = i % LaneSize;
12135 if (InLaneMask[k] < 0) {
12136 InLaneMask[k] = Mask[i] % LaneSize;
12137 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12138 // This doesn't fit a repeating in-lane mask.
12143 // First shuffle the lanes into place.
// Express the lane fix-up as a shuffle of 64-bit elements: each 128-bit lane
// becomes a pair of 64-bit elements, so moving lane L means moving elements
// 2*L and 2*L+1.
12144 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12145 VT.getSizeInBits() / 64);
12146 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12147 for (int i = 0; i < NumLanes; ++i)
12148 if (Lanes[i] >= 0) {
12149 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12150 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12153 V1 = DAG.getBitcast(LaneVT, V1);
12154 V2 = DAG.getBitcast(LaneVT, V2);
12155 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12157 // Cast it back to the type we actually want.
12158 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12160 // Now do a simple shuffle that isn't lane crossing.
// The in-lane mask repeats identically in every lane, so rebase each element
// into its own lane: (lane base) + (in-lane offset).
12161 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12162 for (int i = 0; i < Size; ++i)
12164 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12165 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12166 "Must not introduce lane crosses at this point!");
12168 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12171 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12172 /// This allows for fast cases such as subvector extraction/insertion
12173 /// or shuffling smaller vector types which can lower more efficiently.
12174 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12175 SDValue V1, SDValue V2,
12176 ArrayRef<int> Mask,
12177 const X86Subtarget &Subtarget,
12178 SelectionDAG &DAG) {
12179 assert(VT.is256BitVector() && "Expected 256-bit vector");
12181 unsigned NumElts = VT.getVectorNumElements();
12182 unsigned HalfNumElts = NumElts / 2;
// 128-bit vector type with the same element type, used for the half-width ops.
12183 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12185 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12186 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
// Neither half is fully undef - nothing for this routine to exploit.
12187 if (!UndefLower && !UndefUpper)
12190 // Upper half is undef and lower half is whole upper subvector.
12191 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12193 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12194 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12195 DAG.getIntPtrConstant(HalfNumElts, DL));
12196 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12197 DAG.getIntPtrConstant(0, DL));
12200 // Lower half is undef and upper half is whole lower subvector.
12201 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12203 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12204 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12205 DAG.getIntPtrConstant(0, DL));
12206 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12207 DAG.getIntPtrConstant(HalfNumElts, DL));
12210 // If the shuffle only uses two of the four halves of the input operands,
12211 // then extract them and perform the 'half' shuffle at half width.
12212 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12213 int HalfIdx1 = -1, HalfIdx2 = -1;
12214 SmallVector<int, 8> HalfMask(HalfNumElts);
// Scan only the defined half of the destination mask.
12215 unsigned Offset = UndefLower ? HalfNumElts : 0;
12216 for (unsigned i = 0; i != HalfNumElts; ++i) {
12217 int M = Mask[i + Offset];
12223 // Determine which of the 4 half vectors this element is from.
12224 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12225 int HalfIdx = M / HalfNumElts;
12227 // Determine the element index into its half vector source.
12228 int HalfElt = M % HalfNumElts;
12230 // We can shuffle with up to 2 half vectors, set the new 'half'
12231 // shuffle mask accordingly.
12232 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12233 HalfMask[i] = HalfElt;
12234 HalfIdx1 = HalfIdx;
12237 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
// Elements from the second half source are indexed past the first source.
12238 HalfMask[i] = HalfElt + HalfNumElts;
12239 HalfIdx2 = HalfIdx;
12243 // Too many half vectors referenced.
12246 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12248 // Only shuffle the halves of the inputs when useful.
// Count how many of the (up to two) referenced halves are lower/upper halves;
// this drives the profitability heuristics below.
12249 int NumLowerHalves =
12250 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12251 int NumUpperHalves =
12252 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12254 // uuuuXXXX - don't extract uppers just to insert again.
12255 if (UndefLower && NumUpperHalves != 0)
12258 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12259 if (UndefUpper && NumUpperHalves == 2)
12262 // AVX2 - XXXXuuuu - always extract lowers.
12263 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12264 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12265 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12267 // AVX2 supports variable 32-bit element cross-lane shuffles.
12268 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12269 // XXXXuuuu - don't extract lowers and uppers.
12270 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
// Materialize a referenced 128-bit half as a HalfVT value (UNDEF for an
// unused slot).
12275 auto GetHalfVector = [&](int HalfIdx) {
12277 return DAG.getUNDEF(HalfVT);
12278 SDValue V = (HalfIdx < 2 ? V1 : V2);
12279 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12280 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12281 DAG.getIntPtrConstant(HalfIdx, DL));
// Do the shuffle at half width, then place the result into the defined half
// of an otherwise-undef 256-bit vector.
12284 SDValue Half1 = GetHalfVector(HalfIdx1);
12285 SDValue Half2 = GetHalfVector(HalfIdx2);
12286 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12287 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12288 DAG.getIntPtrConstant(Offset, DL));
12291 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12294 /// This returns true if the elements from a particular input are already in the
12295 /// slot required by the given mask and require no permutation.
12296 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12297 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12298 int Size = Mask.size();
// Mask values in [0, Size) select input 0, values in [Size, 2*Size) select
// input 1; an element of the requested input is "in place" when its in-input
// index (Mask[i] % Size) equals its destination slot i.
12299 for (int i = 0; i < Size; ++i)
12300 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12306 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12307 /// every lane can be represented as the same repeating mask - allowing us to
12308 /// shuffle the sources with the repeating shuffle and then permute the result
12309 /// to the destination lanes.
12310 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12311 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12313 int NumElts = VT.getVectorNumElements();
12314 int NumLanes = VT.getSizeInBits() / 128;
12315 int NumLaneElts = NumElts / NumLanes;
12317 // On AVX2 we may be able to just shuffle the lowest elements and then
12318 // broadcast the result.
12319 if (Subtarget.hasAVX2()) {
12320 for (unsigned BroadcastSize : {16, 32, 64}) {
// Skip broadcast widths no wider than a single scalar element.
12321 if (BroadcastSize <= VT.getScalarSizeInBits())
12323 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12325 // Attempt to match a repeating pattern every NumBroadcastElts,
12326 // accounting for UNDEFs but only references the lowest 128-bit
12327 // lane of the inputs.
12328 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12329 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12330 for (int j = 0; j != NumBroadcastElts; ++j) {
12331 int M = Mask[i + j];
12334 int &R = RepeatMask[j];
// Reject sources outside the lowest 128-bit lane of either input.
12335 if (0 != ((M % NumElts) / NumLaneElts))
// Reject conflicting entries for the same repeat position.
12337 if (0 <= R && R != M)
12344 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12345 if (!FindRepeatingBroadcastMask(RepeatMask))
12348 // Shuffle the (lowest) repeated elements in place for broadcast.
12349 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12351 // Shuffle the actual broadcast.
12352 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12353 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12354 for (int j = 0; j != NumBroadcastElts; ++j)
12355 BroadcastMask[i + j] = j;
12356 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12361 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12362 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12365 // Bail if we already have a repeated lane shuffle mask.
12366 SmallVector<int, 8> RepeatedShuffleMask;
12367 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12370 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12371 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12372 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12373 int NumSubLanes = NumLanes * SubLaneScale;
12374 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12376 // Check that all the sources are coming from the same lane and see if we can
12377 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12378 // determine the source sub-lane for each destination sub-lane.
12379 int TopSrcSubLane = -1;
12380 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
// One candidate repeated mask per sub-lane position (SubLaneScale <= 2).
12381 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12382 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12383 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12385 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12386 // Extract the sub-lane mask, check that it all comes from the same lane
12387 // and normalize the mask entries to come from the first lane.
12389 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12390 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12391 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12394 int Lane = (M % NumElts) / NumLaneElts;
12395 if ((0 <= SrcLane) && (SrcLane != Lane))
// Keep the V1/V2 distinction (offset by NumElts) while dropping the lane.
12398 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12399 SubLaneMask[Elt] = LocalM;
12402 // Whole sub-lane is UNDEF.
12406 // Attempt to match against the candidate repeated sub-lane masks.
12407 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
// Two masks are compatible if they agree on every element both define.
12408 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12409 for (int i = 0; i != NumSubLaneElts; ++i) {
12410 if (M1[i] < 0 || M2[i] < 0)
12412 if (M1[i] != M2[i])
12418 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12419 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12422 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12423 for (int i = 0; i != NumSubLaneElts; ++i) {
12424 int M = SubLaneMask[i];
12427 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12428 "Unexpected mask element");
12429 RepeatedSubLaneMask[i] = M;
12432 // Track the top most source sub-lane - by setting the remaining to UNDEF
12433 // we can greatly simplify shuffle matching.
12434 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12435 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12436 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12440 // Bail if we failed to find a matching repeated sub-lane mask.
12441 if (Dst2SrcSubLanes[DstSubLane] < 0)
12444 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12445 "Unexpected source lane");
12447 // Create a repeating shuffle mask for the entire vector.
// Only sub-lanes up to TopSrcSubLane are populated; the rest stay UNDEF.
12448 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12449 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12450 int Lane = SubLane / SubLaneScale;
12451 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12452 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12453 int M = RepeatedSubLaneMask[Elt];
12456 int Idx = (SubLane * NumSubLaneElts) + Elt;
12457 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12460 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12462 // Shuffle each source sub-lane to its destination.
12463 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12464 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12465 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12466 if (SrcSubLane < 0)
12468 for (int j = 0; j != NumSubLaneElts; ++j)
12469 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12472 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
// Match Mask against the element-selection pattern of (V)SHUFPD, producing the
// instruction's immediate in ShuffleImm. Each destination element i may only
// take one of two adjacent sources; bit i of the immediate picks the odd one.
// V1/V2 are taken by reference so a commuted match can swap them (the
// commuted handling is below, past this excerpt).
12476 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12477 unsigned &ShuffleImm,
12478 ArrayRef<int> Mask) {
12479 int NumElts = VT.getVectorNumElements();
12480 assert(VT.getScalarSizeInBits() == 64 &&
12481 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12482 "Unexpected data type for VSHUFPD");
12484 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12485 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
12487 bool ShufpdMask = true;
12488 bool CommutableMask = true;
12489 for (int i = 0; i < NumElts; ++i) {
12490 if (Mask[i] == SM_SentinelUndef)
// Val is the base source index SHUFPD allows for destination element i
// (even elements read V1, odd elements read V2); CommutVal is the same with
// the operands swapped.
12494 int Val = (i & 6) + NumElts * (i & 1);
12495 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12496 if (Mask[i] < Val || Mask[i] > Val + 1)
12497 ShufpdMask = false;
12498 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12499 CommutableMask = false;
// Record whether the odd element of the pair was chosen.
12500 ShuffleImm |= (Mask[i] % 2) << i;
12505 if (CommutableMask) {
// Lower a 64-bit-element shuffle with a single SHUFP node when the mask
// matches the (V)SHUFPD selection pattern.
12513 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12514 ArrayRef<int> Mask, SDValue V1,
12515 SDValue V2, SelectionDAG &DAG) {
12516 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12517 "Unexpected data type for VSHUFPD");
12519 unsigned Immediate = 0;
// matchVectorShuffleWithSHUFPD may commute V1/V2 (they are passed by
// reference) and fills in the immediate control byte.
12520 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12523 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12524 DAG.getConstant(Immediate, DL, MVT::i8));
// Lower a shuffle with a variable-mask cross-lane permute: VPERMV for a
// single input, VPERMV3 when both inputs are live. The shuffle mask is
// materialized as a constant vector of per-element indices.
12527 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12528 ArrayRef<int> Mask, SDValue V1,
12529 SDValue V2, SelectionDAG &DAG) {
// The index vector uses integer elements of the same width as VT's scalars.
12530 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12531 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12533 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12535 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12537 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12540 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12542 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12543 /// isn't available.
12544 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12545 const APInt &Zeroable,
12546 SDValue V1, SDValue V2,
12547 const X86Subtarget &Subtarget,
12548 SelectionDAG &DAG) {
12549 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12550 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12551 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Whole-128-bit-lane shuffles get dedicated (VPERM2F128-style) lowering.
12553 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12554 Zeroable, Subtarget, DAG))
12557 if (V2.isUndef()) {
12558 // Check for being able to broadcast a single element.
12559 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12560 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12563 // Use low duplicate instructions for masks that match their pattern.
12564 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12565 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12567 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12568 // Non-half-crossing single input shuffles can be lowered with an
12569 // interleaved permutation.
// Bit i of the VPERMILPD immediate selects the high element of element i's
// 128-bit lane (elements 0/1 choose between 0 and 1, elements 2/3
// between 2 and 3).
12570 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12571 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12572 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12573 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12576 // With AVX2 we have direct support for this permutation.
12577 if (Subtarget.hasAVX2())
12578 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12579 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12581 // Try to create an in-lane repeating shuffle mask and then shuffle the
12582 // the results into the target lanes.
12583 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12584 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12587 // Otherwise, fall back.
12588 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12592 // Use dedicated unpack instructions for masks that match their pattern.
12594 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12597 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12598 Zeroable, Subtarget, DAG))
12601 // Check if the blend happens to exactly fit that of SHUFPD.
12603 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12606 // Try to create an in-lane repeating shuffle mask and then shuffle the
12607 // the results into the target lanes.
12608 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12609 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12612 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12613 // shuffle. However, if we have AVX2 and either inputs are already in place,
12614 // we will be able to shuffle even across lanes the other input in a single
12615 // instruction so skip this pattern.
12616 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12617 isShuffleMaskInputInPlace(1, Mask))))
12618 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12619 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12621 // If we have VLX support, we can use VEXPAND.
12622 if (Subtarget.hasVLX())
12623 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12624 V1, V2, DAG, Subtarget))
12627 // If we have AVX2 then we always want to lower with a blend because an v4 we
12628 // can fully permute the elements.
12629 if (Subtarget.hasAVX2())
12630 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12633 // Otherwise fall back on generic lowering.
12634 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12637 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12639 /// This routine is only called when we have AVX2 and thus a reasonable
12640 /// instruction set for v4i64 shuffling..
12641 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12642 const APInt &Zeroable,
12643 SDValue V1, SDValue V2,
12644 const X86Subtarget &Subtarget,
12645 SelectionDAG &DAG) {
12646 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12647 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12648 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12649 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
// Whole-128-bit-lane shuffles get dedicated (VPERM2I128-style) lowering.
12651 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12652 Zeroable, Subtarget, DAG))
12655 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12656 Zeroable, Subtarget, DAG))
12659 // Check for being able to broadcast a single element.
12660 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12661 Mask, Subtarget, DAG))
12664 if (V2.isUndef()) {
12665 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12666 // can use lower latency instructions that will operate on both lanes.
12667 SmallVector<int, 2> RepeatedMask;
12668 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the 2-element repeated mask to a 4-element PSHUFD mask over v8i32.
12669 SmallVector<int, 4> PSHUFDMask;
12670 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12671 return DAG.getBitcast(
12673 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12674 DAG.getBitcast(MVT::v8i32, V1),
12675 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12678 // AVX2 provides a direct instruction for permuting a single input across
12680 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12681 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12684 // Try to use shift instructions.
12685 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12686 Zeroable, Subtarget, DAG))
12689 // If we have VLX support, we can use VALIGN or VEXPAND.
12690 if (Subtarget.hasVLX()) {
12691 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12692 Mask, Subtarget, DAG))
12695 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12696 V1, V2, DAG, Subtarget))
12700 // Try to use PALIGNR.
12701 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12702 Mask, Subtarget, DAG))
12705 // Use dedicated unpack instructions for masks that match their pattern.
12707 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12710 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12711 // shuffle. However, if we have AVX2 and either inputs are already in place,
12712 // we will be able to shuffle even across lanes the other input in a single
12713 // instruction so skip this pattern.
12714 if (!isShuffleMaskInputInPlace(0, Mask) &&
12715 !isShuffleMaskInputInPlace(1, Mask))
12716 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12717 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12720 // Otherwise fall back on generic blend lowering.
12721 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12725 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12727 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12728 /// isn't available.
12729 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12730 const APInt &Zeroable,
12731 SDValue V1, SDValue V2,
12732 const X86Subtarget &Subtarget,
12733 SelectionDAG &DAG) {
12734 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12735 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12736 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12738 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12739 Zeroable, Subtarget, DAG))
12742 // Check for being able to broadcast a single element.
12743 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12744 Mask, Subtarget, DAG))
12747 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12748 // options to efficiently lower the shuffle.
12749 SmallVector<int, 4> RepeatedMask;
12750 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12751 assert(RepeatedMask.size() == 4 &&
12752 "Repeated masks must be half the mask width!");
12754 // Use even/odd duplicate instructions for masks that match their pattern.
12755 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12756 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12757 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12758 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
// In-lane permute of a single input via immediate-controlled VPERMILPS.
12761 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12762 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12764 // Use dedicated unpack instructions for masks that match their pattern.
12766 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12769 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12770 // have already handled any direct blends.
12771 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12774 // Try to create an in-lane repeating shuffle mask and then shuffle the
12775 // the results into the target lanes.
12776 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12777 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12780 // If we have a single input shuffle with different shuffle patterns in the
12781 // two 128-bit lanes use the variable mask to VPERMILPS.
12782 if (V2.isUndef()) {
12783 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12784 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12785 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
// Lane-crossing single-input shuffles need AVX2's VPERMPS.
12787 if (Subtarget.hasAVX2())
12788 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12790 // Otherwise, fall back.
12791 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12795 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12797 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12798 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12800 // If we have VLX support, we can use VEXPAND.
12801 if (Subtarget.hasVLX())
12802 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12803 V1, V2, DAG, Subtarget))
12806 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12807 // since after split we get a more efficient code using vpunpcklwd and
12808 // vpunpckhwd instrs than vblend.
12809 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12810 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12814 // If we have AVX2 then we always want to lower with a blend because at v8 we
12815 // can fully permute the elements.
12816 if (Subtarget.hasAVX2())
12817 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12820 // Otherwise fall back on generic lowering.
12821 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12824 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12826 /// This routine is only called when we have AVX2 and thus a reasonable
12827 /// instruction set for v8i32 shuffling..
12828 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12829 const APInt &Zeroable,
12830 SDValue V1, SDValue V2,
12831 const X86Subtarget &Subtarget,
12832 SelectionDAG &DAG) {
12833 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12834 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12835 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12836 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12838 // Whenever we can lower this as a zext, that instruction is strictly faster
12839 // than any alternative. It also allows us to fold memory operands into the
12840 // shuffle in many cases.
12841 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12842 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12845 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12846 // since after split we get a more efficient code than vblend by using
12847 // vpunpcklwd and vpunpckhwd instrs.
12848 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12849 !Subtarget.hasAVX512())
12851 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12854 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12855 Zeroable, Subtarget, DAG))
12858 // Check for being able to broadcast a single element.
12859 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12860 Mask, Subtarget, DAG))
12863 // If the shuffle mask is repeated in each 128-bit lane we can use more
12864 // efficient instructions that mirror the shuffles across the two 128-bit
12866 SmallVector<int, 4> RepeatedMask;
// Remember the repeated-mask result; it is reused for the SHUFPS fallback
// further down.
12867 bool Is128BitLaneRepeatedShuffle =
12868 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12869 if (Is128BitLaneRepeatedShuffle) {
12870 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12872 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12873 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12875 // Use dedicated unpack instructions for masks that match their pattern.
12877 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12881 // Try to use shift instructions.
12882 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12883 Zeroable, Subtarget, DAG))
12886 // If we have VLX support, we can use VALIGN or EXPAND.
12887 if (Subtarget.hasVLX()) {
12888 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12889 Mask, Subtarget, DAG))
12892 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12893 V1, V2, DAG, Subtarget))
12897 // Try to use byte rotation instructions.
12898 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12899 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12902 // Try to create an in-lane repeating shuffle mask and then shuffle the
12903 // results into the target lanes.
12904 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12905 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12908 // If the shuffle patterns aren't repeated but it is a single input, directly
12909 // generate a cross-lane VPERMD instruction.
12910 if (V2.isUndef()) {
12911 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12912 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12915 // Assume that a single SHUFPS is faster than an alternative sequence of
12916 // multiple instructions (even if the CPU has a domain penalty).
12917 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12918 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Bitcast to the float domain so SHUFPS applies, then cast back.
12919 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12920 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12921 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12922 CastV1, CastV2, DAG);
12923 return DAG.getBitcast(MVT::v8i32, ShufPS);
12926 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12928 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12929 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12932 // Otherwise fall back on generic blend lowering.
12933 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12937 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12939 /// This routine is only called when we have AVX2 and thus a reasonable
12940 /// instruction set for v16i16 shuffling..
12941 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12942 const APInt &Zeroable,
12943 SDValue V1, SDValue V2,
12944 const X86Subtarget &Subtarget,
12945 SelectionDAG &DAG) {
12946 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12947 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12948 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12949 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12951 // Whenever we can lower this as a zext, that instruction is strictly faster
12952 // than any alternative. It also allows us to fold memory operands into the
12953 // shuffle in many cases.
12954 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12955 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12958 // Check for being able to broadcast a single element.
12959 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12960 Mask, Subtarget, DAG))
12963 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12964 Zeroable, Subtarget, DAG))
12967 // Use dedicated unpack instructions for masks that match their pattern.
12969 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12972 // Try to use shift instructions.
12973 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12974 Zeroable, Subtarget, DAG))
12977 // Try to use byte rotation instructions.
12978 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12979 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12982 // Try to create an in-lane repeating shuffle mask and then shuffle the
12983 // the results into the target lanes.
12984 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12985 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12988 if (V2.isUndef()) {
12989 // There are no generalized cross-lane shuffle operations available on i16
12991 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12992 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12995 SmallVector<int, 8> RepeatedMask;
12996 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12997 // As this is a single-input shuffle, the repeated mask should be
12998 // a strictly valid v8i16 mask that we can pass through to the v8i16
12999 // lowering to handle even the v16 case.
13000 return lowerV8I16GeneralSingleInputVectorShuffle(
13001 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13005 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13006 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13009 // AVX512BWVL can lower to VPERMW.
13010 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13011 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13013 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13015 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13016 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13019 // Otherwise fall back on generic lowering.
13020 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13023 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13025 /// This routine is only called when we have AVX2 and thus a reasonable
13026 /// instruction set for v32i8 shuffling..
13027 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13028 const APInt &Zeroable,
13029 SDValue V1, SDValue V2,
13030 const X86Subtarget &Subtarget,
13031 SelectionDAG &DAG) {
13032 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13033 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13034 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13035 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13037 // Whenever we can lower this as a zext, that instruction is strictly faster
13038 // than any alternative. It also allows us to fold memory operands into the
13039 // shuffle in many cases.
13040 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13041 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13044 // Check for being able to broadcast a single element.
13045 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13046 Mask, Subtarget, DAG))
13049 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13050 Zeroable, Subtarget, DAG))
13053 // Use dedicated unpack instructions for masks that match their pattern.
13055 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13058 // Try to use shift instructions.
13059 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13060 Zeroable, Subtarget, DAG))
13063 // Try to use byte rotation instructions.
13064 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13065 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13068 // Try to create an in-lane repeating shuffle mask and then shuffle the
13069 // the results into the target lanes.
13070 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13071 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13074 // There are no generalized cross-lane shuffle operations available on i8
13076 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13077 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13080 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13081 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13084 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13086 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13087 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13090 // Otherwise fall back on generic lowering.
13091 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13094 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13096 /// This routine either breaks down the specific type of a 256-bit x86 vector
13097 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13098 /// together based on the available instructions.
13099 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13100 MVT VT, SDValue V1, SDValue V2,
13101 const APInt &Zeroable,
13102 const X86Subtarget &Subtarget,
13103 SelectionDAG &DAG) {
13104 // If we have a single input to the zero element, insert that into V1 if we
13105 // can do so cheaply.
13106 int NumElts = VT.getVectorNumElements();
13107 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13109 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13110 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13111 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13114 // Handle special cases where the lower or upper half is UNDEF.
13116 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13119 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13120 // can check for those subtargets here and avoid much of the subtarget
13121 // querying in the per-vector-type lowering routines. With AVX1 we have
13122 // essentially *zero* ability to manipulate a 256-bit vector with integer
13123 // types. Since we'll use floating point types there eventually, just
13124 // immediately cast everything to a float and operate entirely in that domain.
13125 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13126 int ElementBits = VT.getScalarSizeInBits();
13127 if (ElementBits < 32) {
13128 // No floating point type available, if we can't use the bit operations
13129 // for masking/blending then decompose into 128-bit vectors.
13131 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13133 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13135 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13138 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13139 VT.getVectorNumElements());
13140 V1 = DAG.getBitcast(FpVT, V1);
13141 V2 = DAG.getBitcast(FpVT, V2);
13142 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13145 switch (VT.SimpleTy) {
13147 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13149 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13151 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13153 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13155 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13157 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13160 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13164 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13165 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13166 ArrayRef<int> Mask, SDValue V1,
13167 SDValue V2, SelectionDAG &DAG) {
13168 assert(VT.getScalarSizeInBits() == 64 &&
13169 "Unexpected element type size for 128bit shuffle.");
13171 // To handle 256 bit vector requires VLX and most probably
13172 // function lowerV2X128VectorShuffle() is better solution.
13173 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13175 SmallVector<int, 4> WidenedMask;
13176 if (!canWidenShuffleElements(Mask, WidenedMask))
13179 // Check for patterns which can be matched with a single insert of a 256-bit
13181 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13182 {0, 1, 2, 3, 0, 1, 2, 3});
13183 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13184 {0, 1, 2, 3, 8, 9, 10, 11})) {
13185 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13186 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13187 DAG.getIntPtrConstant(0, DL));
13188 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13189 OnlyUsesV1 ? V1 : V2,
13190 DAG.getIntPtrConstant(0, DL));
13191 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13194 assert(WidenedMask.size() == 4);
13196 // See if this is an insertion of the lower 128-bits of V2 into V1.
13197 bool IsInsert = true;
13199 for (int i = 0; i < 4; ++i) {
13200 assert(WidenedMask[i] >= -1);
13201 if (WidenedMask[i] < 0)
13204 // Make sure all V1 subvectors are in place.
13205 if (WidenedMask[i] < 4) {
13206 if (WidenedMask[i] != i) {
13211 // Make sure we only have a single V2 index and its the lowest 128-bits.
13212 if (V2Index >= 0 || WidenedMask[i] != 4) {
13219 if (IsInsert && V2Index >= 0) {
13220 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13221 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13222 DAG.getIntPtrConstant(0, DL));
13223 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13226 // Try to lower to to vshuf64x2/vshuf32x4.
13227 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13228 unsigned PermMask = 0;
13229 // Insure elements came from the same Op.
13230 for (int i = 0; i < 4; ++i) {
13231 assert(WidenedMask[i] >= -1);
13232 if (WidenedMask[i] < 0)
13235 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13236 unsigned OpIndex = i / 2;
13237 if (Ops[OpIndex].isUndef())
13239 else if (Ops[OpIndex] != Op)
13242 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13243 // bits defined by a vshuf64x2 instruction's immediate control byte.
13244 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13247 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13248 DAG.getConstant(PermMask, DL, MVT::i8));
13251 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13252 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13253 const APInt &Zeroable,
13254 SDValue V1, SDValue V2,
13255 const X86Subtarget &Subtarget,
13256 SelectionDAG &DAG) {
13257 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13258 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13259 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13261 if (V2.isUndef()) {
13262 // Use low duplicate instructions for masks that match their pattern.
13263 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13264 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13266 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13267 // Non-half-crossing single input shuffles can be lowered with an
13268 // interleaved permutation.
13269 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13270 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13271 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13272 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13273 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13274 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13277 SmallVector<int, 4> RepeatedMask;
13278 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13279 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13280 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13283 if (SDValue Shuf128 =
13284 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13287 if (SDValue Unpck =
13288 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13291 // Check if the blend happens to exactly fit that of SHUFPD.
13293 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13296 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13297 V2, DAG, Subtarget))
13300 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13301 Zeroable, Subtarget, DAG))
13304 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13307 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13308 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13309 const APInt &Zeroable,
13310 SDValue V1, SDValue V2,
13311 const X86Subtarget &Subtarget,
13312 SelectionDAG &DAG) {
13313 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13314 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13315 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13317 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13318 // options to efficiently lower the shuffle.
13319 SmallVector<int, 4> RepeatedMask;
13320 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13321 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13323 // Use even/odd duplicate instructions for masks that match their pattern.
13324 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13325 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13326 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13327 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13330 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13331 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13333 // Use dedicated unpack instructions for masks that match their pattern.
13334 if (SDValue Unpck =
13335 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13338 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13339 Zeroable, Subtarget, DAG))
13342 // Otherwise, fall back to a SHUFPS sequence.
13343 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13345 // If we have AVX512F support, we can use VEXPAND.
13346 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13347 V1, V2, DAG, Subtarget))
13350 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13353 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13354 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13355 const APInt &Zeroable,
13356 SDValue V1, SDValue V2,
13357 const X86Subtarget &Subtarget,
13358 SelectionDAG &DAG) {
13359 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13360 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13361 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13363 if (SDValue Shuf128 =
13364 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13367 if (V2.isUndef()) {
13368 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13369 // can use lower latency instructions that will operate on all four
13371 SmallVector<int, 2> Repeated128Mask;
13372 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13373 SmallVector<int, 4> PSHUFDMask;
13374 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13375 return DAG.getBitcast(
13377 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13378 DAG.getBitcast(MVT::v16i32, V1),
13379 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13382 SmallVector<int, 4> Repeated256Mask;
13383 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13384 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13385 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13388 // Try to use shift instructions.
13389 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13390 Zeroable, Subtarget, DAG))
13393 // Try to use VALIGN.
13394 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13395 Mask, Subtarget, DAG))
13398 // Try to use PALIGNR.
13399 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13400 Mask, Subtarget, DAG))
13403 if (SDValue Unpck =
13404 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13406 // If we have AVX512F support, we can use VEXPAND.
13407 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13408 V2, DAG, Subtarget))
13411 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13412 Zeroable, Subtarget, DAG))
13415 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13418 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13419 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13420 const APInt &Zeroable,
13421 SDValue V1, SDValue V2,
13422 const X86Subtarget &Subtarget,
13423 SelectionDAG &DAG) {
13424 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13425 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13428 // Whenever we can lower this as a zext, that instruction is strictly faster
13429 // than any alternative. It also allows us to fold memory operands into the
13430 // shuffle in many cases.
13431 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13432 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13435 // If the shuffle mask is repeated in each 128-bit lane we can use more
13436 // efficient instructions that mirror the shuffles across the four 128-bit
13438 SmallVector<int, 4> RepeatedMask;
13439 bool Is128BitLaneRepeatedShuffle =
13440 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13441 if (Is128BitLaneRepeatedShuffle) {
13442 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13444 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13447 // Use dedicated unpack instructions for masks that match their pattern.
13449 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13453 // Try to use shift instructions.
13454 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13455 Zeroable, Subtarget, DAG))
13458 // Try to use VALIGN.
13459 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13460 Mask, Subtarget, DAG))
13463 // Try to use byte rotation instructions.
13464 if (Subtarget.hasBWI())
13465 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13466 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13469 // Assume that a single SHUFPS is faster than using a permv shuffle.
13470 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13471 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13472 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13473 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13474 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13475 CastV1, CastV2, DAG);
13476 return DAG.getBitcast(MVT::v16i32, ShufPS);
13478 // If we have AVX512F support, we can use VEXPAND.
13479 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13480 V1, V2, DAG, Subtarget))
13483 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13484 Zeroable, Subtarget, DAG))
13486 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13489 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13490 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13491 const APInt &Zeroable,
13492 SDValue V1, SDValue V2,
13493 const X86Subtarget &Subtarget,
13494 SelectionDAG &DAG) {
13495 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13496 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13497 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13498 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13500 // Whenever we can lower this as a zext, that instruction is strictly faster
13501 // than any alternative. It also allows us to fold memory operands into the
13502 // shuffle in many cases.
13503 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13504 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13507 // Use dedicated unpack instructions for masks that match their pattern.
13509 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13512 // Try to use shift instructions.
13513 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13514 Zeroable, Subtarget, DAG))
13517 // Try to use byte rotation instructions.
13518 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13519 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13522 if (V2.isUndef()) {
13523 SmallVector<int, 8> RepeatedMask;
13524 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13525 // As this is a single-input shuffle, the repeated mask should be
13526 // a strictly valid v8i16 mask that we can pass through to the v8i16
13527 // lowering to handle even the v32 case.
13528 return lowerV8I16GeneralSingleInputVectorShuffle(
13529 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13533 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13534 Zeroable, Subtarget, DAG))
13537 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13540 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13541 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13542 const APInt &Zeroable,
13543 SDValue V1, SDValue V2,
13544 const X86Subtarget &Subtarget,
13545 SelectionDAG &DAG) {
13546 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13547 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13548 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13549 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13551 // Whenever we can lower this as a zext, that instruction is strictly faster
13552 // than any alternative. It also allows us to fold memory operands into the
13553 // shuffle in many cases.
13554 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13555 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13558 // Use dedicated unpack instructions for masks that match their pattern.
13560 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13563 // Try to use shift instructions.
13564 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13565 Zeroable, Subtarget, DAG))
13568 // Try to use byte rotation instructions.
13569 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13570 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13573 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13574 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13577 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13578 if (Subtarget.hasVBMI())
13579 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13581 // Try to create an in-lane repeating shuffle mask and then shuffle the
13582 // the results into the target lanes.
13583 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13584 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13587 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13588 Zeroable, Subtarget, DAG))
13591 // FIXME: Implement direct support for this type!
13592 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13595 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13597 /// This routine either breaks down the specific type of a 512-bit x86 vector
13598 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13599 /// together based on the available instructions.
13600 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13601 MVT VT, SDValue V1, SDValue V2,
13602 const APInt &Zeroable,
13603 const X86Subtarget &Subtarget,
13604 SelectionDAG &DAG) {
13605 assert(Subtarget.hasAVX512() &&
13606 "Cannot lower 512-bit vectors w/ basic ISA!");
13608 // If we have a single input to the zero element, insert that into V1 if we
13609 // can do so cheaply.
13610 int NumElts = Mask.size();
13611 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13613 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13614 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13615 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13618 // Check for being able to broadcast a single element.
13619 if (SDValue Broadcast =
13620 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13623 // Dispatch to each element type for lowering. If we don't have support for
13624 // specific element type shuffles at 512 bits, immediately split them and
13625 // lower them. Each lowering routine of a given type is allowed to assume that
13626 // the requisite ISA extensions for that element type are available.
13627 switch (VT.SimpleTy) {
13629 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13631 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13633 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13635 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13637 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13639 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13642 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13646 // Lower vXi1 vector shuffles.
13647 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
13648 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
13649 // vector, shuffle and then truncate it back.
13650 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13651 MVT VT, SDValue V1, SDValue V2,
13652 const X86Subtarget &Subtarget,
13653 SelectionDAG &DAG) {
13654 assert(Subtarget.hasAVX512() &&
13655 "Cannot lower 512-bit vectors w/o basic ISA!");
13657 switch (VT.SimpleTy) {
13659 llvm_unreachable("Expected a vector of i1 elements");
13661 ExtVT = MVT::v2i64;
13664 ExtVT = MVT::v4i32;
13667 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13670 ExtVT = MVT::v16i32;
13673 ExtVT = MVT::v32i16;
13676 ExtVT = MVT::v64i8;
13680 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13681 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13682 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13683 V1 = getOnesVector(ExtVT, DAG, DL);
13685 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13688 V2 = DAG.getUNDEF(ExtVT);
13689 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13690 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13691 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13692 V2 = getOnesVector(ExtVT, DAG, DL);
13694 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13696 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13697 // i1 was sign extended we can use X86ISD::CVT2MASK.
13698 int NumElems = VT.getVectorNumElements();
13699 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13700 (Subtarget.hasDQI() && (NumElems < 32)))
13701 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13703 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13706 /// Helper function that returns true if the shuffle mask should be
13707 /// commuted to improve canonicalization.
13708 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13709 int NumElements = Mask.size();
13711 int NumV1Elements = 0, NumV2Elements = 0;
13715 else if (M < NumElements)
13720 // Commute the shuffle as needed such that more elements come from V1 than
13721 // V2. This allows us to match the shuffle pattern strictly on how many
13722 // elements come from V1 without handling the symmetric cases.
13723 if (NumV2Elements > NumV1Elements)
13726 assert(NumV1Elements > 0 && "No V1 indices");
13728 if (NumV2Elements == 0)
13731 // When the number of V1 and V2 elements are the same, try to minimize the
13732 // number of uses of V2 in the low half of the vector. When that is tied,
13733 // ensure that the sum of indices for V1 is equal to or lower than the sum
13734 // indices for V2. When those are equal, try to ensure that the number of odd
13735 // indices for V1 is lower than the number of odd indices for V2.
13736 if (NumV1Elements == NumV2Elements) {
13737 int LowV1Elements = 0, LowV2Elements = 0;
13738 for (int M : Mask.slice(0, NumElements / 2))
13739 if (M >= NumElements)
13743 if (LowV2Elements > LowV1Elements)
13745 if (LowV2Elements == LowV1Elements) {
13746 int SumV1Indices = 0, SumV2Indices = 0;
13747 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13748 if (Mask[i] >= NumElements)
13750 else if (Mask[i] >= 0)
13752 if (SumV2Indices < SumV1Indices)
13754 if (SumV2Indices == SumV1Indices) {
13755 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13756 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13757 if (Mask[i] >= NumElements)
13758 NumV2OddIndices += i % 2;
13759 else if (Mask[i] >= 0)
13760 NumV1OddIndices += i % 2;
13761 if (NumV2OddIndices < NumV1OddIndices)
13770 /// \brief Top-level lowering for x86 vector shuffles.
13772 /// This handles decomposition, canonicalization, and lowering of all x86
13773 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13774 /// above in helper routines. The canonicalization attempts to widen shuffles
13775 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13776 /// s.t. only one of the two inputs needs to be tested, etc.
13777 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13778 SelectionDAG &DAG) {
13779 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13780 ArrayRef<int> Mask = SVOp->getMask();
13781 SDValue V1 = Op.getOperand(0);
13782 SDValue V2 = Op.getOperand(1);
13783 MVT VT = Op.getSimpleValueType();
13784 int NumElements = VT.getVectorNumElements();
// i1 mask vectors are the only permitted 64-bit-or-narrower case below;
// true MMX (64-bit) shuffles must never reach this function.
13786 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13788 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13789 "Can't lower MMX shuffles");
13791 bool V1IsUndef = V1.isUndef();
13792 bool V2IsUndef = V2.isUndef();
13793 if (V1IsUndef && V2IsUndef)
13794 return DAG.getUNDEF(VT);
13796 // When we create a shuffle node we put the UNDEF node to second operand,
13797 // but in some cases the first operand may be transformed to UNDEF.
13798 // In this case we should just commute the node.
13800 return DAG.getCommutedVectorShuffle(*SVOp);
13802 // Check for non-undef masks pointing at an undef vector and make the masks
13803 // undef as well. This makes it easier to match the shuffle based solely on
// the mask. (Mask entries >= NumElements select lanes of V2.)
13807 if (M >= NumElements) {
13808 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13809 for (int &M : NewMask)
13810 if (M >= NumElements)
13812 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13815 // Check for illegal shuffle mask element index values.
// Valid indices are [-1, Size) with one input, [-1, 2*Size) with two.
13816 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13817 assert(llvm::all_of(Mask,
13818 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13819 "Out of bounds shuffle index");
13821 // We actually see shuffles that are entirely re-arrangements of a set of
13822 // zero inputs. This mostly happens while decomposing complex shuffles into
13823 // simple ones. Directly lower these as a buildvector of zeros.
13824 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13825 if (Zeroable.isAllOnesValue())
13826 return getZeroVector(VT, Subtarget, DAG, DL);
13828 // Try to collapse shuffles into using a vector type with fewer elements but
13829 // wider element types. We cap this to not form integers or floating point
13830 // elements wider than 64 bits, but it might be interesting to form i128
13831 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13832 SmallVector<int, 16> WidenedMask;
13833 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13834 canWidenShuffleElements(Mask, WidenedMask)) {
13835 MVT NewEltVT = VT.isFloatingPoint()
13836 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13837 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2)
13838 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13839 // Make sure that the new vector type is legal. For example, v2f64 isn't
// legal on every subtarget; only widen when the target can use NewVT.
13841 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13842 V1 = DAG.getBitcast(NewVT, V1);
13843 V2 = DAG.getBitcast(NewVT, V2);
13844 return DAG.getBitcast(
13845 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13849 // Commute the shuffle if it will improve canonicalization.
13850 if (canonicalizeShuffleMaskWithCommute(Mask))
13851 return DAG.getCommutedVectorShuffle(*SVOp);
13853 // For each vector width, delegate to a specialized lowering routine.
13854 if (VT.is128BitVector())
13855 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13858 if (VT.is256BitVector())
13859 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13862 if (VT.is512BitVector())
13863 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13867 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13869 llvm_unreachable("Unimplemented!");
13872 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13873 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13874 const X86Subtarget &Subtarget,
13875 SelectionDAG &DAG) {
13876 SDValue Cond = Op.getOperand(0);
13877 SDValue LHS = Op.getOperand(1);
13878 SDValue RHS = Op.getOperand(2);
13880 MVT VT = Op.getSimpleValueType();
// Only an all-constant condition vector can be turned into a shuffle mask.
13882 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13884 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13886 // Only non-legal VSELECTs reach this lowering, convert those into generic
13887 // shuffles and re-use the shuffle lowering path for blends.
13888 SmallVector<int, 32> Mask;
13889 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13890 SDValue CondElt = CondBV->getOperand(i);
// A zero condition element selects lane i of RHS (mask index i + Size);
// a non-zero constant selects lane i of LHS.
13892 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13895 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13898 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13899 // A vselect where all conditions and data are constants can be optimized into
13900 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13901 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13902 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13903 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13906 // Try to lower this to a blend-style vector shuffle. This can handle all
13907 // constant condition cases.
13908 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13911 // Variable blends are only legal from SSE4.1 onward.
13912 if (!Subtarget.hasSSE41())
13915 // Only some types will be legal on some subtargets. If we can emit a legal
13916 // VSELECT-matching blend, return Op, but if we need to expand, return
13918 switch (Op.getSimpleValueType().SimpleTy) {
13920 // Most of the vector types have blends past SSE4.1.
13924 // The byte blends for AVX vectors were introduced only in AVX2.
13925 if (Subtarget.hasAVX2())
13932 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13933 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13936 // FIXME: We should custom lower this by fixing the condition and using i8
// Lower EXTRACT_VECTOR_ELT of a 128-bit vector using SSE4.1-era patterns
// (PEXTRB, EXTRACTPS-style lane extraction, pextrq) where profitable.
13942 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13943 MVT VT = Op.getSimpleValueType();
13946 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13949 if (VT.getSizeInBits() == 8) {
// PEXTRB produces a zero-extended i32; assert the zext so the truncate
// back to the 8-bit result type is known to be lossless.
13950 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13951 Op.getOperand(0), Op.getOperand(1));
13952 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13953 DAG.getValueType(VT));
13954 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13957 if (VT == MVT::f32) {
13958 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13959 // the result back to FR32 register. It's only worth matching if the
13960 // result has a single use which is a store or a bitcast to i32. And in
13961 // the case of a store, it's not worth it if the index is a constant 0,
13962 // because a MOVSSmr can be used instead, which is smaller and faster.
13963 if (!Op.hasOneUse())
13965 SDNode *User = *Op.getNode()->use_begin();
13966 if ((User->getOpcode() != ISD::STORE ||
13967 isNullConstant(Op.getOperand(1))) &&
13968 (User->getOpcode() != ISD::BITCAST ||
13969 User->getValueType(0) != MVT::i32))
// Reinterpret the vector as v4i32 and extract the lane as an integer;
// the caller's bitcast/store consumes the i32 directly.
13971 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13972 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13974 return DAG.getBitcast(MVT::f32, Extract);
13977 if (VT == MVT::i32 || VT == MVT::i64) {
13978 // ExtractPS/pextrq works with constant index.
13979 if (isa<ConstantSDNode>(Op.getOperand(1)))
13986 /// Extract one bit from mask vector, like v16i1 or v8i1.
13987 /// AVX-512 feature.
13989 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13990 SDValue Vec = Op.getOperand(0);
13992 MVT VecVT = Vec.getSimpleValueType();
13993 SDValue Idx = Op.getOperand(1);
13994 MVT EltVT = Op.getSimpleValueType();
13996 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13997 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13998 "Unexpected vector type in ExtractBitFromMaskVector")
14000 // variable index can't be handled in mask registers,
14001 // extend vector to VR512/128
14002 if (!isa<ConstantSDNode>(Idx)) {
14003 unsigned NumElts = VecVT.getVectorNumElements();
14004 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14005 // than extending to 128/256bit.
14006 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14007 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14008 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14009 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14010 ExtVT.getVectorElementType(), Ext, Idx);
14011 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14014 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Narrow masks (and 8-element masks without DQI's kshiftb) are widened to
// v16i1 so the 16-bit kshift instructions can be used.
14015 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14016 (VecVT.getVectorNumElements() < 8)) {
14017 // Use kshiftlw/rw instruction.
14018 VecVT = MVT::v16i1;
14019 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14020 DAG.getUNDEF(VecVT),
14022 DAG.getIntPtrConstant(0, dl));
// NOTE(review): 'MaxSift' looks like a typo for 'MaxShift' — rename when
// this function is next touched (local variable only, no ABI impact).
14024 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
// Shift the wanted bit up to the MSB, then down to bit 0, so a plain
// extract of element 0 yields the requested i1.
14025 if (MaxSift - IdxVal)
14026 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14027 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14028 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14029 DAG.getConstant(MaxSift, dl, MVT::i8));
14030 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
14031 DAG.getIntPtrConstant(0, dl));
// Lower EXTRACT_VECTOR_ELT: mask (i1) extracts are delegated, wide vectors
// are first narrowed to their containing 128-bit chunk, then per-size
// patterns (pextrw, SSE4.1 forms, shuffles) are applied.
14035 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14036 SelectionDAG &DAG) const {
14038 SDValue Vec = Op.getOperand(0);
14039 MVT VecVT = Vec.getSimpleValueType();
14040 SDValue Idx = Op.getOperand(1);
14042 if (Op.getSimpleValueType() == MVT::i1)
14043 return ExtractBitFromMaskVector(Op, DAG);
14045 if (!isa<ConstantSDNode>(Idx)) {
14046 // Its more profitable to go through memory (1 cycles throughput)
14047 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14048 // IACA tool was used to get performance estimation
14049 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14051 // example : extractelement <16 x i8> %a, i32 %i
14053 // Block Throughput: 3.00 Cycles
14054 // Throughput Bottleneck: Port5
14056 // | Num Of | Ports pressure in cycles | |
14057 // | Uops | 0 - DV | 5 | 6 | 7 | |
14058 // ---------------------------------------------
14059 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14060 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14061 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14062 // Total Num Of Uops: 4
14065 // Block Throughput: 1.00 Cycles
14066 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14068 // | | Ports pressure in cycles | |
14069 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14070 // ---------------------------------------------------------
14071 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14072 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14073 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14074 // Total Num Of Uops: 4
14079 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14081 // If this is a 256-bit vector result, first extract the 128-bit vector and
14082 // then extract the element from the 128-bit vector.
14083 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14084 // Get the 128-bit vector.
14085 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14086 MVT EltVT = VecVT.getVectorElementType();
14088 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14089 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14091 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14092 // this can be done with a mask.
14093 IdxVal &= ElemsPerChunk - 1;
14094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14095 DAG.getConstant(IdxVal, dl, MVT::i32));
14098 assert(VecVT.is128BitVector() && "Unexpected vector length");
14100 MVT VT = Op.getSimpleValueType();
14102 if (VT.getSizeInBits() == 16) {
14103 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14104 // we're going to zero extend the register or fold the store (SSE41 only).
14105 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14106 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14107 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14108 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14109 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14111 // Transform it so it match pextrw which produces a 32-bit result.
14112 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14113 Op.getOperand(0), Op.getOperand(1));
14114 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14115 DAG.getValueType(VT));
14116 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14119 if (Subtarget.hasSSE41())
14120 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14123 // TODO: We only extract a single element from v16i8, we can probably afford
14124 // to be more aggressive here before using the default approach of spilling to
14126 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14127 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14128 int DWordIdx = IdxVal / 4;
14129 if (DWordIdx == 0) {
14130 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14131 DAG.getBitcast(MVT::v4i32, Vec),
14132 DAG.getIntPtrConstant(DWordIdx, dl));
// Shift the wanted byte down within the extracted dword, then truncate.
14133 int ShiftVal = (IdxVal % 4) * 8;
14135 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14136 DAG.getConstant(ShiftVal, dl, MVT::i32));
14137 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14140 int WordIdx = IdxVal / 2;
14141 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14142 DAG.getBitcast(MVT::v8i16, Vec),
14143 DAG.getIntPtrConstant(WordIdx, dl));
// Same trick on an i16: shift out the low byte if the odd byte is wanted.
14144 int ShiftVal = (IdxVal % 2) * 8;
14146 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14147 DAG.getConstant(ShiftVal, dl, MVT::i16));
14148 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14151 if (VT.getSizeInBits() == 32) {
14155 // SHUFPS the element to the lowest double word, then movss.
14156 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14157 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14158 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14159 DAG.getIntPtrConstant(0, dl));
14162 if (VT.getSizeInBits() == 64) {
14163 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14164 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14165 // to match extract_elt for f64.
14169 // UNPCKHPD the element to the lowest double word, then movsd.
14170 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14171 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14172 int Mask[2] = { 1, -1 };
14173 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14174 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14175 DAG.getIntPtrConstant(0, dl));
14181 /// Insert one bit to mask vector, like v16i1 or v8i1.
14182 /// AVX-512 feature.
14184 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14186 SDValue Vec = Op.getOperand(0);
14187 SDValue Elt = Op.getOperand(1);
14188 SDValue Idx = Op.getOperand(2);
14189 MVT VecVT = Vec.getSimpleValueType();
14191 if (!isa<ConstantSDNode>(Idx)) {
14192 // Non constant index. Extend source and destination,
14193 // insert element and then truncate the result.
14194 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14195 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14196 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14197 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14198 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14199 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14202 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14203 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14204 unsigned NumElems = VecVT.getVectorNumElements();
// Inserting into an undef vector: just shift the bit into position.
14206 if(Vec.isUndef()) {
14208 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14209 DAG.getConstant(IdxVal, dl, MVT::i8));
14213 // Insertion of one bit into first or last position
14214 // can be done with two SHIFTs + OR.
14215 if (IdxVal == 0 ) {
14216 // EltInVec already at correct index and other bits are 0.
14217 // Clean the first bit in source vector.
// kshiftr then kshiftl by 1 zeroes bit 0 while keeping the rest.
14218 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14219 DAG.getConstant(1 , dl, MVT::i8));
14220 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14221 DAG.getConstant(1, dl, MVT::i8));
14223 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14225 if (IdxVal == NumElems -1) {
14226 // Move the bit to the last position inside the vector.
14227 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14228 DAG.getConstant(IdxVal, dl, MVT::i8));
14229 // Clean the last bit in the source vector.
// kshiftl then kshiftr by 1 zeroes the top bit while keeping the rest.
14230 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14231 DAG.getConstant(1, dl, MVT::i8));
14232 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14233 DAG.getConstant(1 , dl, MVT::i8));
14235 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14238 // Use shuffle to insert element.
// Identity mask except lane IdxVal, which takes lane 0 of EltInVec
// (mask value NumElems addresses the second shuffle operand).
14239 SmallVector<int, 64> MaskVec(NumElems);
14240 for (unsigned i = 0; i != NumElems; ++i)
14241 MaskVec[i] = (i == IdxVal) ? NumElems : i;
14243 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14246 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14247 SelectionDAG &DAG) const {
14248 MVT VT = Op.getSimpleValueType();
14249 MVT EltVT = VT.getVectorElementType();
14250 unsigned NumElts = VT.getVectorNumElements();
// AVX-512 mask vectors take the dedicated bit-insertion path.
14252 if (EltVT == MVT::i1)
14253 return InsertBitToMaskVector(Op, DAG);
14256 SDValue N0 = Op.getOperand(0);
14257 SDValue N1 = Op.getOperand(1);
14258 SDValue N2 = Op.getOperand(2);
14259 if (!isa<ConstantSDNode>(N2))
14261 auto *N2C = cast<ConstantSDNode>(N2);
14262 unsigned IdxVal = N2C->getZExtValue();
14264 bool IsZeroElt = X86::isZeroNode(N1);
14265 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14267 // If we are inserting a element, see if we can do this more efficiently with
14268 // a blend shuffle with a rematerializable vector than a costly integer
14270 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
14271 // be beneficial if we are inserting several zeros and can combine the masks.
14272 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
14273 SmallVector<int, 8> BlendMask;
14274 for (unsigned i = 0; i != NumElts; ++i)
14275 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14276 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14277 : DAG.getConstant(-1, dl, VT);
14278 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14281 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14282 // into that, and then insert the subvector back into the result.
14283 if (VT.is256BitVector() || VT.is512BitVector()) {
14284 // With a 256-bit vector, we can insert into the zero element efficiently
14285 // using a blend if we have AVX or AVX2 and the right data type.
14286 if (VT.is256BitVector() && IdxVal == 0) {
14287 // TODO: It is worthwhile to cast integer to floating point and back
14288 // and incur a domain crossing penalty if that's what we'll end up
14289 // doing anyway after extracting to a 128-bit vector.
14290 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14291 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14292 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14293 N2 = DAG.getIntPtrConstant(1, dl);
14294 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14298 // Get the desired 128-bit vector chunk.
14299 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14301 // Insert the element into the desired chunk.
14302 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14303 assert(isPowerOf2_32(NumEltsIn128));
14304 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14305 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14307 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14308 DAG.getConstant(IdxIn128, dl, MVT::i32));
14310 // Insert the changed part back into the bigger vector
14311 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14313 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14315 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14316 // argument. SSE41 required for pinsrb.
14317 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14319 if (VT == MVT::v8i16) {
14320 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14321 Opc = X86ISD::PINSRW;
14323 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14324 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14325 Opc = X86ISD::PINSRB;
14328 if (N1.getValueType() != MVT::i32)
14329 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14330 if (N2.getValueType() != MVT::i32)
14331 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14332 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14335 if (Subtarget.hasSSE41()) {
14336 if (EltVT == MVT::f32) {
14337 // Bits [7:6] of the constant are the source select. This will always be
14338 // zero here. The DAG Combiner may combine an extract_elt index into
14339 // these bits. For example (insert (extract, 3), 2) could be matched by
14340 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14341 // Bits [5:4] of the constant are the destination select. This is the
14342 // value of the incoming immediate.
14343 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14344 // combine either bitwise AND or insert of float 0.0 to set these bits.
14346 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14347 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14348 // If this is an insertion of 32-bits into the low 32-bits of
14349 // a vector, we prefer to generate a blend with immediate rather
14350 // than an insertps. Blends are simpler operations in hardware and so
14351 // will always have equal or better performance than insertps.
14352 // But if optimizing for size and there's a load folding opportunity,
14353 // generate insertps because blendps does not have a 32-bit memory
14355 N2 = DAG.getIntPtrConstant(1, dl);
14356 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14357 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
// Destination lane goes in imm bits [5:4], hence the << 4.
14359 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14360 // Create this as a scalar to vector..
14361 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14362 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14365 // PINSR* works with constant index.
14366 if (EltVT == MVT::i32 || EltVT == MVT::i64)
// Lower SCALAR_TO_VECTOR: fold zero scalars to a zero vector, build wide
// results via a 128-bit insert, and route small integers through v4i32.
14373 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14374 SelectionDAG &DAG) {
14376 MVT OpVT = Op.getSimpleValueType();
14378 // It's always cheaper to replace a xor+movd with xorps and simplifies further
14380 if (X86::isZeroNode(Op.getOperand(0)))
14381 return getZeroVector(OpVT, Subtarget, DAG, dl);
14383 // If this is a 256-bit vector result, first insert into a 128-bit
14384 // vector and then insert into the 256-bit vector.
14385 if (!OpVT.is128BitVector()) {
14386 // Insert into a 128-bit vector.
14387 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14388 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14389 OpVT.getVectorNumElements() / SizeFactor);
14391 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14393 // Insert the 128-bit vector.
14394 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14396 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14398 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14399 if (OpVT == MVT::v4i32)
// Otherwise widen the scalar to i32 and build the node as v4i32, then
// bitcast back to the requested 128-bit type.
14402 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14403 return DAG.getBitcast(
14404 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14407 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14408 // a simple subregister reference or explicit instructions to grab
14409 // upper bits of a vector.
14410 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14411 SelectionDAG &DAG) {
14412 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14415 SDValue In = Op.getOperand(0);
14416 SDValue Idx = Op.getOperand(1);
14417 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14418 MVT ResVT = Op.getSimpleValueType();
14420 assert((In.getSimpleValueType().is256BitVector() ||
14421 In.getSimpleValueType().is512BitVector()) &&
14422 "Can only extract from 256-bit or 512-bit vectors");
14424 // If the input is a buildvector just emit a smaller one.
// Slice ElemsPerChunk operands starting at IdxVal straight out of the
// BUILD_VECTOR instead of extracting at runtime.
14425 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14426 if (In.getOpcode() == ISD::BUILD_VECTOR)
14427 return DAG.getBuildVector(
14428 ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
14430 // Everything else is legal.
14434 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14435 // simple superregister reference or explicit instructions to insert
14436 // the upper bits of a vector.
14437 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14438 SelectionDAG &DAG) {
// Only i1 (AVX-512 mask) vectors are custom-lowered here; everything else
// never reaches this function.
14439 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14441 return insert1BitVector(Op, DAG, Subtarget);
14444 // Returns the appropriate wrapper opcode for a global reference.
// WrapperRIP (RIP-relative addressing) is used only for PIC RIP-relative
// subtargets under the small/kernel code models; plain Wrapper otherwise,
// and always for absolute symbols, which are never PC-relative.
14445 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14446 // References to absolute symbols are never PC-relative.
14447 if (GV && GV->isAbsoluteSymbolRef())
14448 return X86ISD::Wrapper;
14450 CodeModel::Model M = getTargetMachine().getCodeModel();
14451 if (Subtarget.isPICStyleRIPRel() &&
14452 (M == CodeModel::Small || M == CodeModel::Kernel))
14453 return X86ISD::WrapperRIP;
14455 return X86ISD::Wrapper;
14458 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14459 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14460 // one of the above mentioned nodes. It has to be wrapped because otherwise
14461 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14462 // be used to form addressing mode. These wrapped nodes will be selected
14465 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14466 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14468 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14469 // global base reg.
14470 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14472 auto PtrVT = getPointerTy(DAG.getDataLayout());
14473 SDValue Result = DAG.getTargetConstantPool(
14474 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14476 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14477 // With PIC, the address is actually $g + Offset.
14480 DAG.getNode(ISD::ADD, DL, PtrVT,
14481 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lower a jump-table reference: wrap the TargetJumpTable node, adding the
// PIC global base register when required (same scheme as LowerConstantPool).
14487 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14488 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14490 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14491 // global base reg.
14492 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14494 auto PtrVT = getPointerTy(DAG.getDataLayout());
14495 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14497 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14499 // With PIC, the address is actually $g + Offset.
14502 DAG.getNode(ISD::ADD, DL, PtrVT,
14503 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lower an external-symbol reference: wrap the TargetExternalSymbol node,
// add the PIC base when needed, and load through the GOT for stub refs.
14509 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14510 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14512 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14513 // global base reg.
14514 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14515 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14517 auto PtrVT = getPointerTy(DAG.getDataLayout());
14518 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14521 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14523 // With PIC, the address is actually $g + Offset.
14524 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14526 DAG.getNode(ISD::ADD, DL, PtrVT,
14527 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14530 // For symbols that require a load from a stub to get the address, emit the
14532 if (isGlobalStubReference(OpFlag))
14533 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14534 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// Lower a blockaddress reference the same way as other wrapped symbols:
// TargetBlockAddress + wrapper, plus the PIC base when the reference is
// relative to it.
14540 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14541 // Create the TargetBlockAddressAddress node.
14542 unsigned char OpFlags =
14543 Subtarget.classifyBlockAddressReference();
14544 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14545 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14547 auto PtrVT = getPointerTy(DAG.getDataLayout());
14548 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14549 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14551 // With PIC, the address is actually $g + Offset.
14552 if (isGlobalRelativeToPICBase(OpFlags)) {
14553 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14554 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14560 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14561 const SDLoc &dl, int64_t Offset,
14562 SelectionDAG &DAG) const {
14563 // Create the TargetGlobalAddress node, folding in the constant
14564 // offset if it is legal.
14565 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14566 CodeModel::Model M = DAG.getTarget().getCodeModel();
14567 auto PtrVT = getPointerTy(DAG.getDataLayout());
// The offset can be folded into the TGA only for a direct reference whose
// total displacement fits the code model; otherwise it is added at the end.
14569 if (OpFlags == X86II::MO_NO_FLAG &&
14570 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14571 // A direct static reference to a global.
14572 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14575 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14578 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14580 // With PIC, the address is actually $g + Offset.
14581 if (isGlobalRelativeToPICBase(OpFlags)) {
14582 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14583 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14586 // For globals that require a load from a stub to get the address, emit the
14588 if (isGlobalStubReference(OpFlags))
14589 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14590 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14592 // If there was a non-zero offset that we didn't fold, create an explicit
14593 // addition for it.
14595 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14596 DAG.getConstant(Offset, dl, PtrVT));
// SDValue-based entry point: unpack the GlobalAddressSDNode and forward to
// the (GlobalValue, Offset) overload above.
14602 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14603 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14604 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14605 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
// Emit an X86ISD::TLSADDR/TLSBASEADDR pseudo-call for the given global and
// return the result copied out of ReturnReg. InFlag may be null when no
// glue chain is needed (64-bit models); LocalDynamic selects TLSBASEADDR.
14609 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14610 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14611 unsigned char OperandFlags, bool LocalDynamic = false) {
14612 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14613 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14615 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14616 GA->getValueType(0),
14620 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14624 SDValue Ops[] = { Chain, TGA, *InFlag };
14625 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14627 SDValue Ops[] = { Chain, TGA };
14628 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14631 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14632 MFI.setAdjustsStack(true);
14633 MFI.setHasCalls(true);
14635 SDValue Flag = Chain.getValue(1);
14636 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14639 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14641 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14644 SDLoc dl(GA); // ? function entry point might be better
// The 32-bit GD sequence requires the GOT base in EBX before the
// ___tls_get_addr call; the result comes back in EAX.
14645 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14646 DAG.getNode(X86ISD::GlobalBaseReg,
14647 SDLoc(), PtrVT), InFlag);
14648 InFlag = Chain.getValue(1);
14650 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14653 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
// 64-bit TLSGD needs no explicit GOT-base copy (RIP-relative addressing);
// emit the TLSADDR pseudo directly and take the result from RAX.
14655 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14657 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14658 X86::RAX, X86II::MO_TLSGD);
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: obtain the
// module's TLS block base via a TLSLD/TLSLDM call (TLSBASEADDR pseudo), then
// add the variable's x@dtpoff offset to it.
// NOTE(review): the is64Bit branch structure around the two Base computations
// is elided in this view — the RAX/MO_TLSLD path presumably is 64-bit and the
// EAX/MO_TLSLDM path 32-bit; confirm against the full file.
14661 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14667 // Get the start address of the TLS block for this module.
14668 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14669 .getInfo<X86MachineFunctionInfo>();
// Counted so CleanupLocalDynamicTLSPass (see note below) can be driven.
14670 MFI->incNumLocalDynamicTLSAccesses();
14674 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14675 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14678 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14679 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14680 InFlag = Chain.getValue(1);
14681 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14682 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14685 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// The variable's offset within the TLS block, wrapped as x@dtpoff.
14689 unsigned char OperandFlags = X86II::MO_DTPOFF;
14690 unsigned WrapperKind = X86ISD::Wrapper;
14691 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14692 GA->getValueType(0),
14693 GA->getOffset(), OperandFlags);
14694 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14696 // Add x@dtpoff with the base.
14697 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14700 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// The result is thread-pointer + per-variable offset, where the offset is
// either a link-time constant (local exec) or loaded from the GOT (initial
// exec). Operand flags select the relocation; see the asm comments below.
14701 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14702 const EVT PtrVT, TLSModel::Model model,
14703 bool is64Bit, bool isPIC) {
14706 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
// Address spaces 256/257 are how the x86 backend spells the GS/FS segments;
// loading from a null pointer in that space reads the thread pointer.
14707 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14708 is64Bit ? 257 : 256));
14710 SDValue ThreadPointer =
14711 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14712 MachinePointerInfo(Ptr));
14714 unsigned char OperandFlags = 0;
14715 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14717 unsigned WrapperKind = X86ISD::Wrapper;
14718 if (model == TLSModel::LocalExec) {
14719 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14720 } else if (model == TLSModel::InitialExec) {
14722 OperandFlags = X86II::MO_GOTTPOFF;
14723 WrapperKind = X86ISD::WrapperRIP;
14725 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14728 llvm_unreachable("Unexpected model");
14731 // emit "addl x@ntpoff,%eax" (local exec)
14732 // or "addl x@indntpoff,%eax" (initial exec)
14733 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14735 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14736 GA->getOffset(), OperandFlags);
14737 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Initial exec: the actual offset lives in the GOT; for 32-bit PIC the GOT
// slot address is formed relative to the global base register first.
14739 if (model == TLSModel::InitialExec) {
14740 if (isPIC && !is64Bit) {
14741 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14742 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14746 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14747 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14750 // The address of the thread local variable is the add of the thread
14751 // pointer with the offset of the variable.
14752 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
// Top-level lowering for ISD::GlobalTLSAddress. Dispatches by target:
//   - emulated TLS (any target) -> runtime helper (LowerToTLSEmulatedModel);
//   - ELF -> one of the four classic TLS models, chosen by getTLSModel();
//   - Darwin -> single TLVP model via an X86ISD::TLSCALL call sequence;
//   - Windows (MSVC/Itanium/GNU) -> implicit TLS through the TEB/_tls_index.
// NOTE(review): the return-type line and various braces are elided in this
// view; several `switch`/`if` headers between the visible lines are missing.
14756 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14758 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14760 if (DAG.getTarget().Options.EmulatedTLS)
14761 return LowerToTLSEmulatedModel(GA, DAG);
14763 const GlobalValue *GV = GA->getGlobal();
14764 auto PtrVT = getPointerTy(DAG.getDataLayout());
14765 bool PositionIndependent = isPositionIndependent();
14767 if (Subtarget.isTargetELF()) {
14768 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14770 case TLSModel::GeneralDynamic:
14771 if (Subtarget.is64Bit())
14772 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14773 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14774 case TLSModel::LocalDynamic:
14775 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14776 Subtarget.is64Bit());
14777 case TLSModel::InitialExec:
14778 case TLSModel::LocalExec:
14779 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14780 PositionIndependent);
14782 llvm_unreachable("Unknown TLS model.");
14785 if (Subtarget.isTargetDarwin()) {
14786 // Darwin only has one model of TLS. Lower to that.
14787 unsigned char OpFlag = 0;
14788 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14789 X86ISD::WrapperRIP : X86ISD::Wrapper;
14791 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14792 // global base reg.
14793 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14795 OpFlag = X86II::MO_TLVP_PIC_BASE;
14797 OpFlag = X86II::MO_TLVP;
14799 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14800 GA->getValueType(0),
14801 GA->getOffset(), OpFlag);
14802 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14804 // With PIC32, the address is actually $g + Offset.
14806 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14807 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14810 // Lowering the machine isd will make sure everything is in the right
// Wrap the TLVP call in CALLSEQ_START/END so frame lowering treats it as a
// real call site.
14812 SDValue Chain = DAG.getEntryNode();
14813 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14814 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14815 SDValue Args[] = { Chain, Offset };
14816 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14817 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14818 DAG.getIntPtrConstant(0, DL, true),
14819 Chain.getValue(1), DL);
14821 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14822 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14823 MFI.setAdjustsStack(true);
14825 // And our return value (tls address) is in the standard call return value
14827 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14828 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14831 if (Subtarget.isTargetKnownWindowsMSVC() ||
14832 Subtarget.isTargetWindowsItanium() ||
14833 Subtarget.isTargetWindowsGNU()) {
14834 // Just use the implicit TLS architecture
14835 // Need to generate something similar to:
14836 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14838 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14839 // mov rcx, qword [rdx+rcx*8]
14840 // mov eax, .tls$:tlsvar
14841 // [rax+rcx] contains the address
14842 // Windows 64bit: gs:0x58
14843 // Windows 32bit: fs:__tls_array
14846 SDValue Chain = DAG.getEntryNode();
14848 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14849 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14850 // use its literal value of 0x2C.
14851 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14852 ? Type::getInt8PtrTy(*DAG.getContext(),
14854 : Type::getInt32PtrTy(*DAG.getContext(),
14857 SDValue TlsArray = Subtarget.is64Bit()
14858 ? DAG.getIntPtrConstant(0x58, dl)
14859 : (Subtarget.isTargetWindowsGNU()
14860 ? DAG.getIntPtrConstant(0x2C, dl)
14861 : DAG.getExternalSymbol("_tls_array", PtrVT));
14863 SDValue ThreadPointer =
14864 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
// Local-exec TLS on Windows: the thread pointer already points at this
// module's TLS block; otherwise index the block array via _tls_index.
14867 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14868 res = ThreadPointer;
14870 // Load the _tls_index variable
14871 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14872 if (Subtarget.is64Bit())
14873 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14874 MachinePointerInfo(), MVT::i32);
14876 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
// Scale the index by pointer size (the mov rcx,[rdx+rcx*8] step above).
14878 auto &DL = DAG.getDataLayout();
14880 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14881 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14883 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14886 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14888 // Get the offset of start of .tls section
14889 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14890 GA->getValueType(0),
14891 GA->getOffset(), X86II::MO_SECREL);
14892 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14894 // The address of the thread local variable is the add of the thread
14895 // pointer with the offset of the variable.
14896 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14899 llvm_unreachable("TLS not implemented for this target.");
14902 /// Lower SRA_PARTS and friends, which return two i32 values
14903 /// and take a 2 x i32 value to shift plus a shift amount.
// Lowers to SHLD/SHRD plus a plain shift, then a CMP + two CMOVs to handle
// shift amounts >= the part width (where SHLD/SHRD results can't be trusted).
14904 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14905 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14906 MVT VT = Op.getSimpleValueType();
14907 unsigned VTBits = VT.getSizeInBits();
14909 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14910 SDValue ShOpLo = Op.getOperand(0);
14911 SDValue ShOpHi = Op.getOperand(1);
14912 SDValue ShAmt = Op.getOperand(2);
14913 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14914 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14916 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14917 DAG.getConstant(VTBits - 1, dl, MVT::i8));
// Tmp1 is what the "overflowed" half becomes: sign-fill for SRA, zero else.
14918 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14919 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14920 : DAG.getConstant(0, dl, VT);
14922 SDValue Tmp2, Tmp3;
14923 if (Op.getOpcode() == ISD::SHL_PARTS) {
14924 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14925 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14927 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14928 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14931 // If the shift amount is larger or equal than the width of a part we can't
14932 // rely on the results of shld/shrd. Insert a test and select the appropriate
14933 // values for large shift amounts.
// Test bit VTBits of the amount: set iff ShAmt >= VTBits (amount < 2*VTBits).
14934 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14935 DAG.getConstant(VTBits, dl, MVT::i8));
14936 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14937 AndNode, DAG.getConstant(0, dl, MVT::i8));
14940 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14941 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14942 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14944 if (Op.getOpcode() == ISD::SHL_PARTS) {
14945 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14946 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14948 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14949 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
// Return both halves as a merged pair (Lo, Hi).
14952 SDValue Ops[2] = { Lo, Hi };
14953 return DAG.getMergeValues(Ops, dl);
// Lower ISD::SINT_TO_FP. Vector sources are widened/extended to a form the
// hardware converts directly; scalar i16..i64 sources that aren't trivially
// legal are spilled to a stack slot and converted via x87 FILD (BuildFILD).
14956 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14957 SelectionDAG &DAG) const {
14958 SDValue Src = Op.getOperand(0);
14959 MVT SrcVT = Src.getSimpleValueType();
14960 MVT VT = Op.getSimpleValueType();
14963 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14964 if (SrcVT.isVector()) {
// v2i32 -> v2f64: widen to v4i32 (upper half undef) and use CVTSI2P.
14965 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14966 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14967 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14968 DAG.getUNDEF(SrcVT)));
// i1 vectors: sign-extend to a convertible integer vector first.
14970 if (SrcVT.getVectorElementType() == MVT::i1) {
14971 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14972 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14973 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14974 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14975 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14976 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14981 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14982 "Unknown SINT_TO_FP to lower!");
14984 // These are really Legal; return the operand so the caller accepts it as
14986 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14988 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14989 Subtarget.is64Bit()) {
14993 SDValue ValueToStore = Op.getOperand(0);
14994 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14995 !Subtarget.is64Bit())
14996 // Bitcasting to f64 here allows us to do a single 64-bit store from
14997 // an SSE register, avoiding the store forwarding penalty that would come
14998 // with two 32-bit stores.
14999 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
// Spill the integer to a fresh stack slot and let FILD load/convert it.
15001 unsigned Size = SrcVT.getSizeInBits()/8;
15002 MachineFunction &MF = DAG.getMachineFunction();
15003 auto PtrVT = getPointerTy(MF.getDataLayout());
15004 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15005 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15006 SDValue Chain = DAG.getStore(
15007 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15008 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15009 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
// Build an x87 FILD that loads SrcVT from StackSlot and converts it to FP.
// When the result will live in an SSE register (useSSE), the FILD is emitted
// as FILD_FLAG and immediately stored back (FST) to a new stack slot, then
// reloaded as the SSE value — a round trip forced by x87/SSE register-class
// separation (see the FIXME below).
15012 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15014 SelectionDAG &DAG) const {
15018 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
// FILD_FLAG needs a glue result so the follow-up FST stays adjacent.
15020 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15022 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15024 unsigned ByteSize = SrcVT.getSizeInBits()/8;
// StackSlot is either a frame index (we build the MMO) or a load whose
// memory operand (and address operand) we can reuse directly.
15026 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15027 MachineMemOperand *MMO;
15029 int SSFI = FI->getIndex();
15030 MMO = DAG.getMachineFunction().getMachineMemOperand(
15031 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15032 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15034 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15035 StackSlot = StackSlot.getOperand(1);
15037 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15038 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15040 Tys, Ops, SrcVT, MMO);
15043 Chain = Result.getValue(1);
15044 SDValue InFlag = Result.getValue(2);
15046 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15047 // shouldn't be necessary except that RFP cannot be live across
15048 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15049 MachineFunction &MF = DAG.getMachineFunction();
15050 unsigned SSFISize = Op.getValueSizeInBits()/8;
15051 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15052 auto PtrVT = getPointerTy(MF.getDataLayout());
15053 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15054 Tys = DAG.getVTList(MVT::Other);
15056 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15058 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15059 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15060 MachineMemOperand::MOStore, SSFISize, SSFISize);
15062 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15063 Ops, Op.getValueType(), MMO);
// Reload the stored value as the final (SSE-register) result.
15064 Result = DAG.getLoad(
15065 Op.getValueType(), DL, Chain, StackSlot,
15066 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15072 /// 64-bit unsigned integer to double expansion.
// Classic magic-number expansion: punpckldq the u64 halves with the exponent
// patterns 0x43300000/0x45300000, subtract the matching powers of two, then
// horizontally add the two doubles. See the asm sketch in the comments below.
15073 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15074 SelectionDAG &DAG) const {
15075 // This algorithm is not obvious. Here it is what we're trying to output:
15078 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15079 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15081 haddpd %xmm0, %xmm0
15083 pshufd $0x4e, %xmm0, %xmm1
15089 LLVMContext *Context = DAG.getContext();
15091 // Build some magic constants.
15092 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15093 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15094 auto PtrVT = getPointerTy(DAG.getDataLayout());
15095 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15097 SmallVector<Constant*,2> CV1;
15099 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15100 APInt(64, 0x4330000000000000ULL)));
15102 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15103 APInt(64, 0x4530000000000000ULL)));
15104 Constant *C1 = ConstantVector::get(CV1);
15105 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15107 // Load the 64-bit value into an XMM register.
15108 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15111 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15112 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15113 /* Alignment = */ 16);
// Interleave the u64's 32-bit halves with the exponent words (punpckldq).
15115 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15118 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15119 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15120 /* Alignment = */ 16);
15121 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15122 // TODO: Are there any fast-math-flags to propagate here?
15123 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
// Sum the low/high contributions: haddpd on SSE3, shuffle+addpd otherwise.
15126 if (Subtarget.hasSSE3()) {
15127 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15128 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15130 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15131 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15132 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15133 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15136 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15137 DAG.getIntPtrConstant(0, dl));
15140 /// 32-bit unsigned integer to float expansion.
// Place the u32 in the mantissa of the double 0x1.0p52 (exponent bits
// 0x43300000) by OR-ing, then subtract the bias 0x1.0p52 to recover the
// exact value; finally round/extend to the destination FP type.
15141 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15142 SelectionDAG &DAG) const {
15144 // FP constant to bias correct the final result.
15145 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15148 // Load the 32-bit value into an XMM register.
15149 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15152 // Zero out the upper parts of the register.
15153 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15155 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15156 DAG.getBitcast(MVT::v2f64, Load),
15157 DAG.getIntPtrConstant(0, dl));
15159 // Or the load with the bias.
15160 SDValue Or = DAG.getNode(
15161 ISD::OR, dl, MVT::v2i64,
15162 DAG.getBitcast(MVT::v2i64,
15163 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15164 DAG.getBitcast(MVT::v2i64,
15165 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15167 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15168 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15170 // Subtract the bias.
15171 // TODO: Are there any fast-math-flags to propagate here?
15172 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15174 // Handle final rounding.
15175 MVT DestVT = Op.getSimpleValueType();
15177 if (DestVT.bitsLT(MVT::f64))
15178 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15179 DAG.getIntPtrConstant(0, dl));
15180 if (DestVT.bitsGT(MVT::f64))
15181 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15183 // Handle final rounding.
// Lower v2i32 -> v2f64 UINT_TO_FP. With AVX512 the widened vector converts
// directly via CVTUI2P; otherwise split each u32 into 16-bit halves, convert
// both halves signed (CVTSI2P), scale the high half by 2^16, and add.
15187 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15188 const X86Subtarget &Subtarget, SDLoc &DL) {
15189 if (Op.getSimpleValueType() != MVT::v2f64)
15192 SDValue N0 = Op.getOperand(0);
15193 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15195 // Legalize to v4i32 type.
15196 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15197 DAG.getUNDEF(MVT::v2i32));
15199 if (Subtarget.hasAVX512())
15200 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15202 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15203 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15204 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15205 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15207 // Two to the power of half-word-size.
15208 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15210 // Clear upper part of LO, lower HI.
15211 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15212 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
// Both halves fit in 16 bits, so the signed convert is exact.
15214 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15215 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15216 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15218 // Add the two halves.
15219 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
// Lower v4i32/v8i32 -> v4f32/v8f32 UINT_TO_FP using the blend/or + FADD
// trick described in the comment block below. Bails out under unsafe-fp-math
// because reassociation of the two FADDs breaks the algorithm (PR24512).
15222 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15223 const X86Subtarget &Subtarget) {
15224 // The algorithm is the following:
15225 // #ifdef __SSE4_1__
15226 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15227 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15228 // (uint4) 0x53000000, 0xaa);
15230 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15231 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15233 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15234 // return (float4) lo + fhi;
15236 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15237 // reassociate the two FADDs, and if we do that, the algorithm fails
15238 // spectacularly (PR24512).
15239 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15240 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15241 // there's also the MachineCombiner reassociations happening on Machine IR.
15242 if (DAG.getTarget().Options.UnsafeFPMath)
15246 SDValue V = Op->getOperand(0);
15247 MVT VecIntVT = V.getSimpleValueType();
15248 bool Is128 = VecIntVT == MVT::v4i32;
15249 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15250 // If we convert to something else than the supported type, e.g., to v4f64,
15252 if (VecFloatVT != Op->getSimpleValueType(0))
15255 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15256 "Unsupported custom type");
15258 // In the #idef/#else code, we have in common:
15259 // - The vector of constants:
15265 // Create the splat vector for 0x4b000000.
15266 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15267 // Create the splat vector for 0x53000000.
15268 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15270 // Create the right shift.
15271 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15272 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
// SSE4.1 path: use 16-bit blends instead of AND/OR to merge the exponent
// words into the low/high halves.
15275 if (Subtarget.hasSSE41()) {
15276 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15277 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15278 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15279 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15280 // Low will be bitcasted right away, so do not bother bitcasting back to its
15282 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15283 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15284 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15285 // (uint4) 0x53000000, 0xaa);
15286 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15287 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15288 // High will be bitcasted right away, so do not bother bitcasting back to
15289 // its original type.
15290 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15291 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15293 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15294 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15295 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15296 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15298 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15299 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15302 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15303 SDValue VecCstFAdd = DAG.getConstantFP(
15304 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15306 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15307 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15308 // TODO: Are there any fast-math-flags to propagate here?
15310 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15311 // return (float4) lo + fhi;
15312 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15313 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
// Dispatch vector UINT_TO_FP by source element type: i1 vectors are
// zero-extended first; the remaining cases are routed to the specialized
// v2i32 / vXi32 lowerings or widened to a legal zero-extended form.
// NOTE(review): the case labels of the switch (15331) are elided in this
// view, so which MVTs map to each return is not visible here.
15316 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15317 SelectionDAG &DAG) const {
15318 SDValue N0 = Op.getOperand(0);
15319 MVT SrcVT = N0.getSimpleValueType();
15322 if (SrcVT.getVectorElementType() == MVT::i1) {
15323 if (SrcVT == MVT::v2i1)
15324 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15325 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15326 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15327 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15328 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15331 switch (SrcVT.SimpleTy) {
15333 llvm_unreachable("Custom UINT_TO_FP is not supported!");
// Zero-extend then convert signed — safe because the zext clears the sign.
15338 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15339 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15340 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15343 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15346 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15349 assert(Subtarget.hasAVX512());
15350 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15351 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
// Lower scalar (and dispatch vector) ISD::UINT_TO_FP. Fast paths: known-zero
// sign bit -> SINT_TO_FP; AVX512 VCVTUSI2SS/SD; the i64->f64 and i32 magic-
// number expansions above. The general i64 path stores to a stack slot,
// converts via x87 FILD in extended precision, and conditionally adds 2^64
// (the 0x5F800000 = 2^64 as f32 "fudge" constant) when the input's sign bit
// was set.
15355 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15356 SelectionDAG &DAG) const {
15357 SDValue N0 = Op.getOperand(0);
15359 auto PtrVT = getPointerTy(DAG.getDataLayout());
15361 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15362 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15363 // the optimization here.
15364 if (DAG.SignBitIsZero(N0))
15365 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15367 if (Op.getSimpleValueType().isVector())
15368 return lowerUINT_TO_FP_vec(Op, DAG);
15370 MVT SrcVT = N0.getSimpleValueType();
15371 MVT DstVT = Op.getSimpleValueType();
15373 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15374 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15375 // Conversions from unsigned i32 to f32/f64 are legal,
15376 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15380 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15381 return LowerUINT_TO_FP_i64(Op, DAG);
15382 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15383 return LowerUINT_TO_FP_i32(Op, DAG);
15384 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15387 // Make a 64-bit buffer, and use it to build an FILD.
15388 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
// i32 source: store it in the low word and zero the high word so the
// 64-bit FILD sees a non-negative value.
15389 if (SrcVT == MVT::i32) {
15390 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15391 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15392 StackSlot, MachinePointerInfo());
15393 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15394 OffsetSlot, MachinePointerInfo());
15395 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15399 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15400 SDValue ValueToStore = Op.getOperand(0);
15401 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15402 // Bitcasting to f64 here allows us to do a single 64-bit store from
15403 // an SSE register, avoiding the store forwarding penalty that would come
15404 // with two 32-bit stores.
15405 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15406 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15407 MachinePointerInfo());
15408 // For i64 source, we need to add the appropriate power of 2 if the input
15409 // was negative. This is the same as the optimization in
15410 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
15411 // we must be careful to do the computation in x87 extended precision, not
15412 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15413 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15414 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15415 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15416 MachineMemOperand::MOLoad, 8, 8);
// f80 result type keeps the add below in x87 extended precision.
15418 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15419 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15420 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
// 0x5F800000 is 2^64 as an IEEE single — the correction added when the
// signed FILD interpreted the input as negative.
15423 APInt FF(32, 0x5F800000ULL);
15425 // Check whether the sign bit is set.
15426 SDValue SignSet = DAG.getSetCC(
15427 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15428 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15430 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15431 SDValue FudgePtr = DAG.getConstantPool(
15432 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15434 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15435 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15436 SDValue Four = DAG.getIntPtrConstant(4, dl);
15437 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
15439 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15441 // Load the value out, extending it from f32 to f80.
15442 // FIXME: Avoid the extend by constructing the right constant pool?
15443 SDValue Fudge = DAG.getExtLoad(
15444 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15445 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15446 /* Alignment = */ 4);
15447 // Extend everything to 80 bits to force it to be done on x87.
15448 // TODO: Are there any fast-math-flags to propagate here?
15449 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15450 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15451 DAG.getIntPtrConstant(0, dl));
15454 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15455 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15456 // just return an <SDValue(), SDValue()> pair.
15457 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15458 // to i16, i32 or i64, and we lower it to a legal sequence.
15459 // If lowered to the final integer result we return a <result, SDValue()> pair.
15460 // Otherwise we lower it to a sequence ending with a FIST, return a
15461 // <FIST, StackSlot> pair, and the caller is responsible for loading
15462 // the final integer result from StackSlot.
15463 std::pair<SDValue,SDValue>
15464 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15465 bool IsSigned, bool IsReplace) const {
15468 EVT DstTy = Op.getValueType();
15469 EVT TheVT = Op.getOperand(0).getValueType();
15470 auto PtrVT = getPointerTy(DAG.getDataLayout());
15472 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15473 // f16 must be promoted before using the lowering in this routine.
15474 // fp128 does not use this lowering.
15475 return std::make_pair(SDValue(), SDValue());
15478 // If using FIST to compute an unsigned i64, we'll need some fixup
15479 // to handle values above the maximum signed i64. A FIST is always
15480 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15481 bool UnsignedFixup = !IsSigned &&
15482 DstTy == MVT::i64 &&
15483 (!Subtarget.is64Bit() ||
15484 !isScalarFPTypeInSSEReg(TheVT));
15486 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15487 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15488 // The low 32 bits of the fist result will have the correct uint32 result.
15489 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15493 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15494 DstTy.getSimpleVT() >= MVT::i16 &&
15495 "Unknown FP_TO_INT to lower!");
15497 // These are really Legal.
15498 if (DstTy == MVT::i32 &&
15499 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15500 return std::make_pair(SDValue(), SDValue());
15501 if (Subtarget.is64Bit() &&
15502 DstTy == MVT::i64 &&
15503 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15504 return std::make_pair(SDValue(), SDValue());
15506 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15508 MachineFunction &MF = DAG.getMachineFunction();
15509 unsigned MemSize = DstTy.getSizeInBits()/8;
15510 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15511 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15514 switch (DstTy.getSimpleVT().SimpleTy) {
15515 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15516 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15517 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15518 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15521 SDValue Chain = DAG.getEntryNode();
15522 SDValue Value = Op.getOperand(0);
15523 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15525 if (UnsignedFixup) {
15527 // Conversion to unsigned i64 is implemented with a select,
15528 // depending on whether the source value fits in the range
15529 // of a signed i64. Let Thresh be the FP equivalent of
15530 // 0x8000000000000000ULL.
15532 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15533 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15534 // Fist-to-mem64 FistSrc
15535 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15536 // to XOR'ing the high 32 bits with Adjust.
15538 // Being a power of 2, Thresh is exactly representable in all FP formats.
15539 // For X87 we'd like to use the smallest FP type for this constant, but
15540 // for DAG type consistency we have to match the FP operand type.
15542 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15543 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15544 bool LosesInfo = false;
15545 if (TheVT == MVT::f64)
15546 // The rounding mode is irrelevant as the conversion should be exact.
15547 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15549 else if (TheVT == MVT::f80)
15550 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15551 APFloat::rmNearestTiesToEven, &LosesInfo);
15553 assert(Status == APFloat::opOK && !LosesInfo &&
15554 "FP conversion should have been exact");
15556 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15558 SDValue Cmp = DAG.getSetCC(DL,
15559 getSetCCResultType(DAG.getDataLayout(),
15560 *DAG.getContext(), TheVT),
15561 Value, ThreshVal, ISD::SETLT);
15562 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15563 DAG.getConstant(0, DL, MVT::i32),
15564 DAG.getConstant(0x80000000, DL, MVT::i32));
15565 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15566 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15567 *DAG.getContext(), TheVT),
15568 Value, ThreshVal, ISD::SETLT);
15569 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15572 // FIXME This causes a redundant load/store if the SSE-class value is already
15573 // in memory, such as if it is on the callstack.
15574 if (isScalarFPTypeInSSEReg(TheVT)) {
15575 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15576 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15577 MachinePointerInfo::getFixedStack(MF, SSFI));
15578 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15580 Chain, StackSlot, DAG.getValueType(TheVT)
15583 MachineMemOperand *MMO =
15584 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15585 MachineMemOperand::MOLoad, MemSize, MemSize);
15586 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15587 Chain = Value.getValue(1);
15588 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15589 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15592 MachineMemOperand *MMO =
15593 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15594 MachineMemOperand::MOStore, MemSize, MemSize);
15596 if (UnsignedFixup) {
15598 // Insert the FIST, load its result as two i32's,
15599 // and XOR the high i32 with Adjust.
15601 SDValue FistOps[] = { Chain, Value, StackSlot };
15602 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15603 FistOps, DstTy, MMO);
15606 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15607 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15610 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15611 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15613 if (Subtarget.is64Bit()) {
15614 // Join High32 and Low32 into a 64-bit result.
15615 // (High32 << 32) | Low32
15616 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15617 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15618 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15619 DAG.getConstant(32, DL, MVT::i8));
15620 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15621 return std::make_pair(Result, SDValue());
15624 SDValue ResultOps[] = { Low32, High32 };
15626 SDValue pair = IsReplace
15627 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15628 : DAG.getMergeValues(ResultOps, DL);
15629 return std::make_pair(pair, SDValue());
15631 // Build the FP_TO_INT*_IN_MEM
15632 SDValue Ops[] = { Chain, Value, StackSlot };
15633 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15635 return std::make_pair(FIST, StackSlot);
15639 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15640 const X86Subtarget &Subtarget) {
15641 MVT VT = Op->getSimpleValueType(0);
15642 SDValue In = Op->getOperand(0);
15643 MVT InVT = In.getSimpleValueType();
15646 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15647 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15649 // Optimize vectors in AVX mode:
15652 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15653 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15654 // Concat upper and lower parts.
15657 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15658 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15659 // Concat upper and lower parts.
15662 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15663 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15664 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15667 if (Subtarget.hasInt256())
15668 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15670 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15671 SDValue Undef = DAG.getUNDEF(InVT);
15672 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15673 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15674 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15676 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15677 VT.getVectorNumElements()/2);
15679 OpLo = DAG.getBitcast(HVT, OpLo);
15680 OpHi = DAG.getBitcast(HVT, OpHi);
15682 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15685 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15686 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15687 MVT VT = Op->getSimpleValueType(0);
15688 SDValue In = Op->getOperand(0);
15689 MVT InVT = In.getSimpleValueType();
15691 unsigned NumElts = VT.getVectorNumElements();
15693 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15694 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15695 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15697 if (InVT.getVectorElementType() != MVT::i1)
15700 // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15702 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15703 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15706 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15708 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15710 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15712 return SelectedVal;
15713 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15716 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15717 SelectionDAG &DAG) {
15718 if (Subtarget.hasFp256())
15719 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15725 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15726 SelectionDAG &DAG) {
15728 MVT VT = Op.getSimpleValueType();
15729 SDValue In = Op.getOperand(0);
15730 MVT SVT = In.getSimpleValueType();
15732 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15733 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15735 if (Subtarget.hasFp256())
15736 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15739 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15740 VT.getVectorNumElements() != SVT.getVectorNumElements());
15744 /// Helper to recursively truncate vector elements in half with PACKSS.
15745 /// It makes use of the fact that vector comparison results will be all-zeros
15746 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15747 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15748 /// within each 128-bit lane.
15749 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15752 const X86Subtarget &Subtarget) {
15753 // Requires SSE2 but AVX512 has fast truncate.
15754 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15757 EVT SrcVT = In.getValueType();
15759 // No truncation required, we might get here due to recursive calls.
15760 if (SrcVT == DstVT)
15763 // We only support vector truncation to 128bits or greater from a
15764 // 256bits or greater source.
15765 if ((DstVT.getSizeInBits() % 128) != 0)
15767 if ((SrcVT.getSizeInBits() % 256) != 0)
15770 unsigned NumElems = SrcVT.getVectorNumElements();
15771 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15772 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15775 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15777 // Extract lower/upper subvectors.
15778 unsigned NumSubElts = NumElems / 2;
15779 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15780 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15781 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15783 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15784 if (SrcVT.is256BitVector()) {
15785 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15786 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15787 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15788 return DAG.getBitcast(DstVT, Res);
15791 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15792 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15793 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15794 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15795 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15796 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15798 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15799 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15800 Res = DAG.getBitcast(MVT::v4i64, Res);
15801 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15803 if (DstVT.is256BitVector())
15804 return DAG.getBitcast(DstVT, Res);
15806 // If 512bit -> 128bit truncate another stage.
15807 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15808 Res = DAG.getBitcast(PackedVT, Res);
15809 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15812 // Recursively pack lower/upper subvectors, concat result and pack again.
15813 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15814 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15815 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15816 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15818 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15819 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15820 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15823 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15824 const X86Subtarget &Subtarget) {
15827 MVT VT = Op.getSimpleValueType();
15828 SDValue In = Op.getOperand(0);
15829 MVT InVT = In.getSimpleValueType();
15831 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15833 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15834 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15835 if (InVT.getScalarSizeInBits() <= 16) {
15836 if (Subtarget.hasBWI()) {
15837 // legal, will go to VPMOVB2M, VPMOVW2M
15838 // Shift packed bytes not supported natively, bitcast to word
15839 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15840 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15841 DAG.getBitcast(ExtVT, In),
15842 DAG.getConstant(ShiftInx, DL, ExtVT));
15843 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15844 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15846 // Use TESTD/Q, extended vector to packed dword/qword.
15847 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15848 "Unexpected vector type.");
15849 unsigned NumElts = InVT.getVectorNumElements();
15850 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15851 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15853 ShiftInx = InVT.getScalarSizeInBits() - 1;
15856 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15857 DAG.getConstant(ShiftInx, DL, InVT));
15858 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15861 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15863 MVT VT = Op.getSimpleValueType();
15864 SDValue In = Op.getOperand(0);
15865 MVT InVT = In.getSimpleValueType();
15867 if (VT == MVT::i1) {
15868 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15869 "Invalid scalar TRUNCATE operation");
15870 if (InVT.getSizeInBits() >= 32)
15872 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15873 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15875 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15876 "Invalid TRUNCATE operation");
15878 if (VT.getVectorElementType() == MVT::i1)
15879 return LowerTruncateVecI1(Op, DAG, Subtarget);
15881 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15882 if (Subtarget.hasAVX512()) {
15883 // word to byte only under BWI
15884 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15885 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15886 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15887 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15890 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15891 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15892 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15895 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15896 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15897 if (Subtarget.hasInt256()) {
15898 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15899 In = DAG.getBitcast(MVT::v8i32, In);
15900 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15901 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15902 DAG.getIntPtrConstant(0, DL));
15905 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15906 DAG.getIntPtrConstant(0, DL));
15907 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15908 DAG.getIntPtrConstant(2, DL));
15909 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15910 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15911 static const int ShufMask[] = {0, 2, 4, 6};
15912 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15915 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15916 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15917 if (Subtarget.hasInt256()) {
15918 In = DAG.getBitcast(MVT::v32i8, In);
15920 // The PSHUFB mask:
15921 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15922 -1, -1, -1, -1, -1, -1, -1, -1,
15923 16, 17, 20, 21, 24, 25, 28, 29,
15924 -1, -1, -1, -1, -1, -1, -1, -1 };
15925 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15926 In = DAG.getBitcast(MVT::v4i64, In);
15928 static const int ShufMask2[] = {0, 2, -1, -1};
15929 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15930 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15931 DAG.getIntPtrConstant(0, DL));
15932 return DAG.getBitcast(VT, In);
15935 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15936 DAG.getIntPtrConstant(0, DL));
15938 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15939 DAG.getIntPtrConstant(4, DL));
15941 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15942 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15944 // The PSHUFB mask:
15945 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15946 -1, -1, -1, -1, -1, -1, -1, -1};
15948 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15949 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15951 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15952 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15954 // The MOVLHPS Mask:
15955 static const int ShufMask2[] = {0, 1, 4, 5};
15956 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15957 return DAG.getBitcast(MVT::v8i16, res);
15960 // Handle truncation of V256 to V128 using shuffles.
15961 if (!VT.is128BitVector() || !InVT.is256BitVector())
15964 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15966 unsigned NumElems = VT.getVectorNumElements();
15967 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15969 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15970 // Prepare truncation shuffle mask
15971 for (unsigned i = 0; i != NumElems; ++i)
15972 MaskVec[i] = i * 2;
15973 In = DAG.getBitcast(NVT, In);
15974 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15975 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15976 DAG.getIntPtrConstant(0, DL));
15979 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15980 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15981 MVT VT = Op.getSimpleValueType();
15983 if (VT.isVector()) {
15984 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15985 SDValue Src = Op.getOperand(0);
15987 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15988 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15989 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15990 DAG.getUNDEF(MVT::v2f32)));
15996 assert(!VT.isVector());
15998 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15999 IsSigned, /*IsReplace=*/ false);
16000 SDValue FIST = Vals.first, StackSlot = Vals.second;
16001 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16002 if (!FIST.getNode())
16005 if (StackSlot.getNode())
16006 // Load the result.
16007 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16009 // The node is the result.
16013 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16015 MVT VT = Op.getSimpleValueType();
16016 SDValue In = Op.getOperand(0);
16017 MVT SVT = In.getSimpleValueType();
16019 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16021 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16022 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16023 In, DAG.getUNDEF(SVT)));
16026 /// The only differences between FABS and FNEG are the mask and the logic op.
16027 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16028 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16029 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16030 "Wrong opcode for lowering FABS or FNEG.");
16032 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16034 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16035 // into an FNABS. We'll lower the FABS after that if it is still in use.
16037 for (SDNode *User : Op->uses())
16038 if (User->getOpcode() == ISD::FNEG)
16042 MVT VT = Op.getSimpleValueType();
16044 bool IsF128 = (VT == MVT::f128);
16046 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16047 // decide if we should generate a 16-byte constant mask when we only need 4 or
16048 // 8 bytes for the scalar case.
16053 if (VT.isVector()) {
16055 EltVT = VT.getVectorElementType();
16056 } else if (IsF128) {
16057 // SSE instructions are used for optimized f128 logical operations.
16058 LogicVT = MVT::f128;
16061 // There are no scalar bitwise logical SSE/AVX instructions, so we
16062 // generate a 16-byte vector constant and logic op even for the scalar case.
16063 // Using a 16-byte mask allows folding the load of the mask with
16064 // the logic op, so it can save (~4 bytes) on code size.
16065 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16069 unsigned EltBits = EltVT.getSizeInBits();
16070 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16072 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16073 const fltSemantics &Sem =
16074 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16075 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16076 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16078 SDValue Op0 = Op.getOperand(0);
16079 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16081 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16082 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16084 if (VT.isVector() || IsF128)
16085 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16087 // For the scalar case extend to a 128-bit vector, perform the logic op,
16088 // and extract the scalar result back out.
16089 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16090 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16091 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16092 DAG.getIntPtrConstant(0, dl));
16095 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16096 SDValue Mag = Op.getOperand(0);
16097 SDValue Sign = Op.getOperand(1);
16100 // If the sign operand is smaller, extend it first.
16101 MVT VT = Op.getSimpleValueType();
16102 if (Sign.getSimpleValueType().bitsLT(VT))
16103 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16105 // And if it is bigger, shrink it first.
16106 if (Sign.getSimpleValueType().bitsGT(VT))
16107 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16109 // At this point the operands and the result should have the same
16110 // type, and that won't be f80 since that is not custom lowered.
16111 bool IsF128 = (VT == MVT::f128);
16112 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16113 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16114 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16115 "Unexpected type in LowerFCOPYSIGN");
16117 MVT EltVT = VT.getScalarType();
16118 const fltSemantics &Sem =
16119 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16120 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16122 // Perform all scalar logic operations as 16-byte vectors because there are no
16123 // scalar FP logic instructions in SSE.
16124 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16125 // unnecessary splats, but we might miss load folding opportunities. Should
16126 // this decision be based on OptimizeForSize?
16127 bool IsFakeVector = !VT.isVector() && !IsF128;
16130 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16132 // The mask constants are automatically splatted for vector types.
16133 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16134 SDValue SignMask = DAG.getConstantFP(
16135 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16136 SDValue MagMask = DAG.getConstantFP(
16137 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16139 // First, clear all bits but the sign bit from the second operand (sign).
16141 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16142 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16144 // Next, clear the sign bit from the first operand (magnitude).
16145 // TODO: If we had general constant folding for FP logic ops, this check
16146 // wouldn't be necessary.
16148 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16149 APFloat APF = Op0CN->getValueAPF();
16151 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16153 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16155 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16156 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16159 // OR the magnitude value with the sign bit.
16160 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16161 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16162 DAG.getIntPtrConstant(0, dl));
16165 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16166 SDValue N0 = Op.getOperand(0);
16168 MVT VT = Op.getSimpleValueType();
16170 MVT OpVT = N0.getSimpleValueType();
16171 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16172 "Unexpected type for FGETSIGN");
16174 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16175 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16176 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16177 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16178 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16179 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16183 // Check whether an OR'd tree is PTEST-able.
16184 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16185 SelectionDAG &DAG) {
16186 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16188 if (!Subtarget.hasSSE41())
16191 if (!Op->hasOneUse())
16194 SDNode *N = Op.getNode();
16197 SmallVector<SDValue, 8> Opnds;
16198 DenseMap<SDValue, unsigned> VecInMap;
16199 SmallVector<SDValue, 8> VecIns;
16200 EVT VT = MVT::Other;
16202 // Recognize a special case where a vector is casted into wide integer to
16204 Opnds.push_back(N->getOperand(0));
16205 Opnds.push_back(N->getOperand(1));
16207 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16208 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16209 // BFS traverse all OR'd operands.
16210 if (I->getOpcode() == ISD::OR) {
16211 Opnds.push_back(I->getOperand(0));
16212 Opnds.push_back(I->getOperand(1));
16213 // Re-evaluate the number of nodes to be traversed.
16214 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16218 // Quit if a non-EXTRACT_VECTOR_ELT
16219 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16222 // Quit if without a constant index.
16223 SDValue Idx = I->getOperand(1);
16224 if (!isa<ConstantSDNode>(Idx))
16227 SDValue ExtractedFromVec = I->getOperand(0);
16228 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16229 if (M == VecInMap.end()) {
16230 VT = ExtractedFromVec.getValueType();
16231 // Quit if not 128/256-bit vector.
16232 if (!VT.is128BitVector() && !VT.is256BitVector())
16234 // Quit if not the same type.
16235 if (VecInMap.begin() != VecInMap.end() &&
16236 VT != VecInMap.begin()->first.getValueType())
16238 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16239 VecIns.push_back(ExtractedFromVec);
16241 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16244 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16245 "Not extracted from 128-/256-bit vector.");
16247 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16249 for (DenseMap<SDValue, unsigned>::const_iterator
16250 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16251 // Quit if not all elements are used.
16252 if (I->second != FullMask)
16256 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16258 // Cast all vectors into TestVT for PTEST.
16259 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16260 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16262 // If more than one full vector is evaluated, OR them first before PTEST.
16263 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16264 // Each iteration will OR 2 nodes and append the result until there is only
16265 // 1 node left, i.e. the final OR'd value of all vectors.
16266 SDValue LHS = VecIns[Slot];
16267 SDValue RHS = VecIns[Slot + 1];
16268 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16271 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16274 /// \brief return true if \c Op has a use that doesn't just read flags.
16275 static bool hasNonFlagsUse(SDValue Op) {
16276 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16278 SDNode *User = *UI;
16279 unsigned UOpNo = UI.getOperandNo();
16280 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16281 // Look pass truncate.
16282 UOpNo = User->use_begin().getOperandNo();
16283 User = *User->use_begin();
16286 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16287 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16293 // Emit KTEST instruction for bit vectors on AVX-512
16294 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16295 const X86Subtarget &Subtarget) {
16296 if (Op.getOpcode() == ISD::BITCAST) {
16297 auto hasKTEST = [&](MVT VT) {
16298 unsigned SizeInBits = VT.getSizeInBits();
16299 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16300 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16302 SDValue Op0 = Op.getOperand(0);
16303 MVT Op0VT = Op0.getValueType().getSimpleVT();
16304 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16306 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16311 /// Emit nodes that will be selected as "test Op0,Op0", or something
16313 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16314 SelectionDAG &DAG) const {
16315 if (Op.getValueType() == MVT::i1) {
16316 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16317 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16318 DAG.getConstant(0, dl, MVT::i8));
16320 // CF and OF aren't always set the way we want. Determine which
16321 // of these we need.
16322 bool NeedCF = false;
16323 bool NeedOF = false;
16326 case X86::COND_A: case X86::COND_AE:
16327 case X86::COND_B: case X86::COND_BE:
16330 case X86::COND_G: case X86::COND_GE:
16331 case X86::COND_L: case X86::COND_LE:
16332 case X86::COND_O: case X86::COND_NO: {
16333 // Check if we really need to set the
16334 // Overflow flag. If NoSignedWrap is present
16335 // that is not actually needed.
16336 switch (Op->getOpcode()) {
16341 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
16342 if (BinNode->Flags.hasNoSignedWrap())
16352 // See if we can use the EFLAGS value from the operand instead of
16353 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16354 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16355 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16356 // Emit KTEST for bit vectors
16357 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16359 // Emit a CMP with 0, which is the TEST pattern.
16360 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16361 DAG.getConstant(0, dl, Op.getValueType()));
16363 unsigned Opcode = 0;
16364 unsigned NumOperands = 0;
16366 // Truncate operations may prevent the merge of the SETCC instruction
16367 // and the arithmetic instruction before it. Attempt to truncate the operands
16368 // of the arithmetic instruction and use a reduced bit-width instruction.
16369 bool NeedTruncation = false;
16370 SDValue ArithOp = Op;
16371 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16372 SDValue Arith = Op->getOperand(0);
16373 // Both the trunc and the arithmetic op need to have one user each.
16374 if (Arith->hasOneUse())
16375 switch (Arith.getOpcode()) {
16382 NeedTruncation = true;
16388 // Sometimes flags can be set either with an AND or with an SRL/SHL
16389 // instruction. SRL/SHL variant should be preferred for masks longer than this
16391 const int ShiftToAndMaxMaskWidth = 32;
16392 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16394 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16395 // which may be the result of a CAST. We use the variable 'Op', which is the
16396 // non-casted variable when we check for possible users.
16397 switch (ArithOp.getOpcode()) {
16399 // Due to an isel shortcoming, be conservative if this add is likely to be
16400 // selected as part of a load-modify-store instruction. When the root node
16401 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16402 // uses of other nodes in the match, such as the ADD in this case. This
16403 // leads to the ADD being left around and reselected, with the result being
16404 // two adds in the output. Alas, even if none our users are stores, that
16405 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16406 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16407 // climbing the DAG back to the root, and it doesn't seem to be worth the
16409 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16410 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16411 if (UI->getOpcode() != ISD::CopyToReg &&
16412 UI->getOpcode() != ISD::SETCC &&
16413 UI->getOpcode() != ISD::STORE)
16416 if (ConstantSDNode *C =
16417 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16418 // An add of one will be selected as an INC.
16419 if (C->isOne() && !Subtarget.slowIncDec()) {
16420 Opcode = X86ISD::INC;
16425 // An add of negative one (subtract of one) will be selected as a DEC.
16426 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16427 Opcode = X86ISD::DEC;
16433 // Otherwise use a regular EFLAGS-setting add.
16434 Opcode = X86ISD::ADD;
16439 // If we have a constant logical shift that's only used in a comparison
16440 // against zero turn it into an equivalent AND. This allows turning it into
16441 // a TEST instruction later.
16442 if (ZeroCheck && Op->hasOneUse() &&
16443 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16444 EVT VT = Op.getValueType();
16445 unsigned BitWidth = VT.getSizeInBits();
16446 unsigned ShAmt = Op->getConstantOperandVal(1);
16447 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16449 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16450 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16451 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16452 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16454 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16455 DAG.getConstant(Mask, dl, VT));
16460 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16461 // because a TEST instruction will be better. However, AND should be
16462 // preferred if the instruction can be combined into ANDN.
16463 if (!hasNonFlagsUse(Op)) {
16464 SDValue Op0 = ArithOp->getOperand(0);
16465 SDValue Op1 = ArithOp->getOperand(1);
16466 EVT VT = ArithOp.getValueType();
16467 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16468 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16469 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16471 // If we cannot select an ANDN instruction, check if we can replace
16472 // AND+IMM64 with a shift before giving up. This is possible for masks
16473 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16474 if (!isProperAndn) {
16478 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16479 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16483 const APInt &Mask = CN->getAPIntValue();
16484 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16485 break; // Prefer TEST instruction.
16487 unsigned BitWidth = Mask.getBitWidth();
16488 unsigned LeadingOnes = Mask.countLeadingOnes();
16489 unsigned TrailingZeros = Mask.countTrailingZeros();
16491 if (LeadingOnes + TrailingZeros == BitWidth) {
16492 assert(TrailingZeros < VT.getSizeInBits() &&
16493 "Shift amount should be less than the type width");
16494 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16495 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16496 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16500 unsigned LeadingZeros = Mask.countLeadingZeros();
16501 unsigned TrailingOnes = Mask.countTrailingOnes();
16503 if (LeadingZeros + TrailingOnes == BitWidth) {
16504 assert(LeadingZeros < VT.getSizeInBits() &&
16505 "Shift amount should be less than the type width");
16506 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16507 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16508 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16519 // Due to the ISEL shortcoming noted above, be conservative if this op is
16520 // likely to be selected as part of a load-modify-store instruction.
16521 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16522 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16523 if (UI->getOpcode() == ISD::STORE)
16526 // Otherwise use a regular EFLAGS-setting instruction.
16527 switch (ArithOp.getOpcode()) {
16528 default: llvm_unreachable("unexpected operator!");
16529 case ISD::SUB: Opcode = X86ISD::SUB; break;
16530 case ISD::XOR: Opcode = X86ISD::XOR; break;
16531 case ISD::AND: Opcode = X86ISD::AND; break;
16533 if (!NeedTruncation && ZeroCheck) {
16534 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16537 Opcode = X86ISD::OR;
16551 return SDValue(Op.getNode(), 1);
16557 // If we found that truncation is beneficial, perform the truncation and
16559 if (NeedTruncation) {
16560 EVT VT = Op.getValueType();
16561 SDValue WideVal = Op->getOperand(0);
16562 EVT WideVT = WideVal.getValueType();
16563 unsigned ConvertedOp = 0;
16564 // Use a target machine opcode to prevent further DAGCombine
16565 // optimizations that may separate the arithmetic operations
16566 // from the setcc node.
16567 switch (WideVal.getOpcode()) {
16569 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16570 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16571 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16572 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16573 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16578 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16579 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16580 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16581 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16587 // Emit KTEST for bit vectors
16588 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16591 // Emit a CMP with 0, which is the TEST pattern.
16592 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16593 DAG.getConstant(0, dl, Op.getValueType()));
16595 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16596 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16598 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16599 DAG.ReplaceAllUsesWith(Op, New);
16600 return SDValue(New.getNode(), 1);
16603 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16605 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16606 const SDLoc &dl, SelectionDAG &DAG) const {
// Comparing against zero is exactly the TEST pattern; delegate to EmitTest,
// which may also reuse EFLAGS from the operand itself.
16607 if (isNullConstant(Op1))
16608 return EmitTest(Op0, X86CC, dl, DAG);
16610 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16611 "Unexpected comparison operation for MVT::i1 operands");
16613 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16614 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16615 // Only promote the compare up to I32 if it is a 16 bit operation
16616 // with an immediate. 16 bit immediates are to be avoided.
16617 if ((Op0.getValueType() == MVT::i16 &&
16618 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16619 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16620 !Subtarget.isAtom()) {
// Choose the extension that preserves the comparison's semantics: zero
// extension for unsigned condition codes, sign extension otherwise.
16621 unsigned ExtendOp =
16622 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16623 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16624 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16626 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16627 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16628 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
// Result #1 of the SUB node is the EFLAGS value.
16630 return SDValue(Sub.getNode(), 1);
// Non-integer (or otherwise unhandled) types: emit a plain CMP node.
16632 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16635 /// Convert a comparison if required by the subtarget.
16636 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16637 SelectionDAG &DAG) const {
16638 // If the subtarget does not support the FUCOMI instruction, floating-point
16639 // comparisons have to be converted.
// No conversion needed when CMOV (and hence FUCOMI) is available, or when
// this is not an FP X86ISD::CMP at all.
16640 if (Subtarget.hasCMov() ||
16641 Cmp.getOpcode() != X86ISD::CMP ||
16642 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16643 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16646 // The instruction selector will select an FUCOM instruction instead of
16647 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16648 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16649 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16651 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16652 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
// Shift the condition bits from FPSW[15:8] down into AH position for SAHF.
16653 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16654 DAG.getConstant(8, dl, MVT::i8));
16655 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16657 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16658 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16659 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16662 /// Check if replacement of SQRT with RSQRT should be disabled.
16663 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16664 EVT VT = Op.getValueType();
16666 // We never want to use both SQRT and RSQRT instructions for the same input.
// If an FRSQRT of this exact operand already exists in the DAG, report SQRT
// as cheap so the RSQRT path is not duplicated alongside it.
16667 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
// Otherwise defer to the subtarget's fast-FSQRT feature flags (vector vs.
// scalar variants).
16671 return Subtarget.hasFastVectorFSQRT();
16672 return Subtarget.hasFastScalarFSQRT();
16675 /// The minimum architected relative accuracy is 2^-12. We need one
16676 /// Newton-Raphson step to have a good float result (24 bits of precision).
16677 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16678 SelectionDAG &DAG, int Enabled,
16679 int &RefinementSteps,
16680 bool &UseOneConstNR,
16681 bool Reciprocal) const {
16682 EVT VT = Op.getValueType();
16684 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16685 // TODO: Add support for AVX512 (v16f32).
16686 // It is likely not profitable to do this for f64 because a double-precision
16687 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16688 // instructions: convert to single, rsqrtss, convert back to double, refine
16689 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16690 // along with FMA, this could be a throughput win.
16691 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16692 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16693 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
// Default to a single Newton-Raphson step if the caller did not specify.
16694 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16695 RefinementSteps = 1;
// The generic NR expansion is used (not the one-constant variant).
16697 UseOneConstNR = false;
16698 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16703 /// The minimum architected relative accuracy is 2^-12. We need one
16704 /// Newton-Raphson step to have a good float result (24 bits of precision).
16705 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16707 int &RefinementSteps) const {
16708 EVT VT = Op.getValueType();
16710 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16711 // TODO: Add support for AVX512 (v16f32).
16712 // It is likely not profitable to do this for f64 because a double-precision
16713 // reciprocal estimate with refinement on x86 prior to FMA requires
16714 // 15 instructions: convert to single, rcpss, convert back to double, refine
16715 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16716 // along with FMA, this could be a throughput win.
16718 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16719 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16720 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16721 // Enable estimate codegen with 1 refinement step for vector division.
16722 // Scalar division estimates are disabled because they break too much
16723 // real-world code. These defaults are intended to match GCC behavior.
16724 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
// Default to a single refinement step when unspecified.
16727 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16728 RefinementSteps = 1;
16730 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16735 /// If we have at least two divisions that use the same divisor, convert to
16736 /// multiplication by a reciprocal. This may need to be adjusted for a given
16737 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16738 /// This is because we still need one division to calculate the reciprocal and
16739 /// then we need two multiplies by that reciprocal as replacements for the
16740 /// original divisions.
16741 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16745 /// Helper for creating a X86ISD::SETCC node.
16746 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16747 SelectionDAG &DAG) {
// Build an i8-valued X86ISD::SETCC that materializes condition \p Cond from
// the given EFLAGS value; the condition code is encoded as an i8 constant.
16748 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16749 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16752 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16753 /// according to equal/not-equal condition code \p CC.
16754 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16755 const SDLoc &dl, SelectionDAG &DAG) {
16756 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16757 // instruction. Since the shift amount is in-range-or-undefined, we know
16758 // that doing a bittest on the i32 value is ok. We extend to i32 because
16759 // the encoding for the i16 version is larger than the i32 version.
16760 // Also promote i16 to i32 for performance / code size reason.
16761 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16762 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16764 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16765 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16766 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16767 // known to be zero.
16768 if (Src.getValueType() == MVT::i64 &&
16769 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16770 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16772 // If the operand types disagree, extend the shift amount to match. Since
16773 // BT ignores high bits (like shifts) we can use anyextend.
16774 if (Src.getValueType() != BitNo.getValueType())
16775 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
// BT sets CF to the tested bit: SETEQ (bit == 0) maps to COND_AE (CF == 0),
// SETNE maps to COND_B (CF == 1).
16777 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16778 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16779 return getSETCC(Cond, BT, dl , DAG);
16782 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16783 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16784 const SDLoc &dl, SelectionDAG &DAG) {
16785 SDValue Op0 = And.getOperand(0);
16786 SDValue Op1 = And.getOperand(1);
// Look through truncates on either operand; validity of doing so is
// re-checked below via computeKnownBits.
16787 if (Op0.getOpcode() == ISD::TRUNCATE)
16788 Op0 = Op0.getOperand(0);
16789 if (Op1.getOpcode() == ISD::TRUNCATE)
16790 Op1 = Op1.getOperand(0);
// Pattern 1: (and X, (shl 1, N)) -- canonicalize the SHL into Op0.
16793 if (Op1.getOpcode() == ISD::SHL)
16794 std::swap(Op0, Op1);
16795 if (Op0.getOpcode() == ISD::SHL) {
16796 if (isOneConstant(Op0.getOperand(0))) {
16797 // If we looked past a truncate, check that it's only truncating away
16799 unsigned BitWidth = Op0.getValueSizeInBits();
16800 unsigned AndBitWidth = And.getValueSizeInBits();
16801 if (BitWidth > AndBitWidth) {
16803 DAG.computeKnownBits(Op0, Zeros, Ones);
16804 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
// The shift amount N becomes the bit number to test.
16808 RHS = Op0.getOperand(1);
// Pattern 2: (and X, C) where C is a power of two too wide for TEST.
16810 } else if (Op1.getOpcode() == ISD::Constant) {
16811 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16812 uint64_t AndRHSVal = AndRHS->getZExtValue();
16813 SDValue AndLHS = Op0;
// (and (srl X, N), 1): test bit N of X directly.
16815 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16816 LHS = AndLHS.getOperand(0);
16817 RHS = AndLHS.getOperand(1);
16820 // Use BT if the immediate can't be encoded in a TEST instruction.
16821 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16823 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16828 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16833 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16834 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16835 const SDLoc &dl, SelectionDAG &DAG) {
16837 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16838 "Expected TRUNCATE to i1 node");
// Only the (truncate (srl X, N) to i1) shape is handled.
16840 if (Op.getOperand(0).getOpcode() != ISD::SRL)
// (trunc (srl X, N) to i1) keeps exactly bit N of X, so test bit N.
16843 SDValue ShiftRight = Op.getOperand(0);
16844 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16848 /// Result of 'and' or 'trunc to i1' is compared against zero.
16849 /// Change to a BT node if possible.
16850 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16851 const SDLoc &dl, SelectionDAG &DAG) const {
// Dispatch on the node shape: 'and' masks and i1 truncates each have their
// own BT-forming helper.
16852 if (Op.getOpcode() == ISD::AND)
16853 return LowerAndToBT(Op, CC, dl, DAG);
16854 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16855 return LowerTruncateToBT(Op, CC, dl, DAG);
16859 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16861 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16866 // SSE Condition code mapping:
// Map an ISD floating-point condition code onto an SSE CMPP/CMPS immediate.
// Some conditions only exist with swapped operands; those set Swap and fall
// through to the swapped-form predicate, and Op0/Op1 are exchanged below.
16875 switch (SetCCOpcode) {
16876 default: llvm_unreachable("Unexpected SETCC condition");
16878 case ISD::SETEQ: SSECC = 0; break;
16880 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16882 case ISD::SETOLT: SSECC = 1; break;
16884 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16886 case ISD::SETOLE: SSECC = 2; break;
16887 case ISD::SETUO: SSECC = 3; break;
16889 case ISD::SETNE: SSECC = 4; break;
16890 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16891 case ISD::SETUGE: SSECC = 5; break;
16892 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16893 case ISD::SETUGT: SSECC = 6; break;
16894 case ISD::SETO: SSECC = 7; break;
// SSECC 8 is out of range for the hardware immediate; callers treat it as
// "needs special two-compare expansion" (SETUEQ/SETONE handling).
16896 case ISD::SETONE: SSECC = 8; break;
16899 std::swap(Op0, Op1);
16904 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
16905 /// concatenate the result back.
16906 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16907 MVT VT = Op.getSimpleValueType();
16909 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16910 "Unsupported value type for operation");
16912 unsigned NumElems = VT.getVectorNumElements();
16914 SDValue CC = Op.getOperand(2);
16916 // Extract the LHS vectors
16917 SDValue LHS = Op.getOperand(0);
16918 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16919 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16921 // Extract the RHS vectors
16922 SDValue RHS = Op.getOperand(1);
16923 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16924 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16926 // Issue the operation on the smaller types and concatenate the result back
// Same condition code is applied to both halves; result halves are joined
// with CONCAT_VECTORS into the original 256-bit type.
16927 MVT EltVT = VT.getVectorElementType();
16928 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16929 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16930 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16931 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16934 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16935 SDValue Op0 = Op.getOperand(0);
16936 SDValue Op1 = Op.getOperand(1);
16937 SDValue CC = Op.getOperand(2);
16938 MVT VT = Op.getSimpleValueType();
16941 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16942 "Unexpected type for boolean compare operation");
16943 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// Precompute ~Op0 and ~Op1 (XOR with all-ones); several of the expansions
// below use one of them.
16944 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16945 DAG.getConstant(-1, dl, VT));
16946 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16947 DAG.getConstant(-1, dl, VT));
// Lower each i1-vector comparison to pure mask-register logic ops.
16948 switch (SetCCOpcode) {
16949 default: llvm_unreachable("Unexpected SETCC condition");
16951 // (x == y) -> ~(x ^ y)
16952 return DAG.getNode(ISD::XOR, dl, VT,
16953 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16954 DAG.getConstant(-1, dl, VT));
16956 // (x != y) -> (x ^ y)
16957 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16960 // (x > y) -> (x & ~y)
16961 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16964 // (x < y) -> (~x & y)
16965 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16968 // (x <= y) -> (~x | y)
16969 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16972 // (x >=y) -> (x | ~y)
16973 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16977 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16979 SDValue Op0 = Op.getOperand(0);
16980 SDValue Op1 = Op.getOperand(1);
16981 SDValue CC = Op.getOperand(2);
16982 MVT VT = Op.getSimpleValueType();
16985 assert(VT.getVectorElementType() == MVT::i1 &&
16986 "Cannot set masked compare for this operation");
16988 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16990 bool Unsigned = false;
// Conditions either map to a dedicated mask-compare node (PCMPEQM/PCMPGTM,
// selected via Opc) or to a CMPM/CMPMU immediate (SSECC), possibly with the
// operands swapped.
16993 switch (SetCCOpcode) {
16994 default: llvm_unreachable("Unexpected SETCC condition");
16995 case ISD::SETNE: SSECC = 4; break;
16996 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16997 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16998 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16999 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17000 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17001 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17002 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17003 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17004 case ISD::SETLE: SSECC = 2; break;
17008 std::swap(Op0, Op1);
// Dedicated node available: no immediate operand needed.
17010 return DAG.getNode(Opc, dl, VT, Op0, Op1);
// Otherwise emit CMPM (signed) or CMPMU (unsigned) with the SSECC immediate.
17011 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17012 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17013 DAG.getConstant(SSECC, dl, MVT::i8));
17016 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17017 /// operand \p Op1. If non-trivial (for example because it's not constant)
17018 /// return an empty value.
17019 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17020 SelectionDAG &DAG) {
// Only a build_vector of constants can be rewritten element-wise.
17021 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17025 MVT VT = Op1.getSimpleValueType();
17026 MVT EVT = VT.getVectorElementType();
17027 unsigned n = VT.getVectorNumElements();
17028 SmallVector<SDValue, 8> ULTOp1;
17030 for (unsigned i = 0; i < n; ++i) {
// Every lane must be a plain (non-opaque) constant of the element type.
17031 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17032 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17035 // Avoid underflow.
// x <u C is equivalent to x <=u C-1, valid only when C != 0 (the elided
// check above bails out on zero to avoid wrapping).
17036 APInt Val = Elt->getAPIntValue();
17040 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17043 return DAG.getBuildVector(VT, dl, ULTOp1);
17046 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17047 SelectionDAG &DAG) {
17048 SDValue Op0 = Op.getOperand(0);
17049 SDValue Op1 = Op.getOperand(1);
17050 SDValue CC = Op.getOperand(2);
17051 MVT VT = Op.getSimpleValueType();
17052 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17053 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17058 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17059 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17063 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17064 assert(VT.getVectorNumElements() <= 16);
17065 Opc = X86ISD::CMPM;
17067 Opc = X86ISD::CMPP;
17068 // The SSE/AVX packed FP comparison nodes are defined with a
17069 // floating-point vector result that matches the operand type. This allows
17070 // them to work with an SSE1 target (integer vector types are not legal).
17071 VT = Op0.getSimpleValueType();
17074 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17075 // emit two comparisons and a logic op to tie them together.
17076 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17079 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
17081 // LLVM predicate is SETUEQ or SETONE.
17083 unsigned CombineOpc;
17084 if (SetCCOpcode == ISD::SETUEQ) {
17087 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17088 static_cast<unsigned>(ISD::OR);
17090 assert(SetCCOpcode == ISD::SETONE);
17093 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17094 static_cast<unsigned>(ISD::AND);
17097 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17098 DAG.getConstant(CC0, dl, MVT::i8));
17099 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17100 DAG.getConstant(CC1, dl, MVT::i8));
17101 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17103 // Handle all other FP comparisons here.
17104 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17105 DAG.getConstant(SSECC, dl, MVT::i8));
17108 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17109 // result type of SETCC. The bitcast is expected to be optimized away
17110 // during combining/isel.
17111 if (Opc == X86ISD::CMPP)
17112 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17117 MVT VTOp0 = Op0.getSimpleValueType();
17118 assert(VTOp0 == Op1.getSimpleValueType() &&
17119 "Expected operands with same type!");
17120 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17121 "Invalid number of packed elements for source and destination!");
17123 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17124 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17125 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17126 // legalizer firstly checks if the first operand in input to the setcc has
17127 // a legal type. If so, then it promotes the return type to that same type.
17128 // Otherwise, the return type is promoted to the 'next legal type' which,
17129 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17131 // We reach this code only if the following two conditions are met:
17132 // 1. Both return type and operand type have been promoted to wider types
17133 // by the type legalizer.
17134 // 2. The original operand type has been promoted to a 256-bit vector.
17136 // Note that condition 2. only applies for AVX targets.
17137 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17138 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17141 // The non-AVX512 code below works under the assumption that source and
17142 // destination types are the same.
17143 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17144 "Value types for source and destination must be the same!");
17146 // Break 256-bit integer vector compare into smaller ones.
17147 if (VT.is256BitVector() && !Subtarget.hasInt256())
17148 return Lower256IntVSETCC(Op, DAG);
17150 // Operands are boolean (vectors of i1)
17151 MVT OpVT = Op1.getSimpleValueType();
17152 if (OpVT.getVectorElementType() == MVT::i1)
17153 return LowerBoolVSETCC_AVX512(Op, DAG);
17155 // The result is boolean, but operands are int/float
17156 if (VT.getVectorElementType() == MVT::i1) {
17157 // In AVX-512 architecture setcc returns mask with i1 elements,
17158 // But there is no compare instruction for i8 and i16 elements in KNL.
17159 // In this case use SSE compare
17160 bool UseAVX512Inst =
17161 (OpVT.is512BitVector() ||
17162 OpVT.getScalarSizeInBits() >= 32 ||
17163 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17166 return LowerIntVSETCC_AVX512(Op, DAG);
17168 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17169 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17172 // Lower using XOP integer comparisons.
17173 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17174 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17175 // Translate compare code to XOP PCOM compare mode.
17176 unsigned CmpMode = 0;
17177 switch (SetCCOpcode) {
17178 default: llvm_unreachable("Unexpected SETCC condition");
17180 case ISD::SETLT: CmpMode = 0x00; break;
17182 case ISD::SETLE: CmpMode = 0x01; break;
17184 case ISD::SETGT: CmpMode = 0x02; break;
17186 case ISD::SETGE: CmpMode = 0x03; break;
17187 case ISD::SETEQ: CmpMode = 0x04; break;
17188 case ISD::SETNE: CmpMode = 0x05; break;
17191 // Are we comparing unsigned or signed integers?
17192 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17193 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17195 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17196 DAG.getConstant(CmpMode, dl, MVT::i8));
17199 // We are handling one of the integer comparisons here. Since SSE only has
17200 // GT and EQ comparisons for integer, swapping operands and multiple
17201 // operations may be required for some comparisons.
17203 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17204 bool Subus = false;
17206 switch (SetCCOpcode) {
17207 default: llvm_unreachable("Unexpected SETCC condition");
17208 case ISD::SETNE: Invert = true;
17209 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17210 case ISD::SETLT: Swap = true;
17211 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17212 case ISD::SETGE: Swap = true;
17213 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17214 Invert = true; break;
17215 case ISD::SETULT: Swap = true;
17216 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17217 FlipSigns = true; break;
17218 case ISD::SETUGE: Swap = true;
17219 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17220 FlipSigns = true; Invert = true; break;
17223 // Special case: Use min/max operations for SETULE/SETUGE
17224 MVT VET = VT.getVectorElementType();
17226 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17227 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17230 switch (SetCCOpcode) {
17232 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17233 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17236 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17239 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17240 if (!MinMax && hasSubus) {
17241 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17243 // t = psubus Op0, Op1
17244 // pcmpeq t, <0..0>
17245 switch (SetCCOpcode) {
17247 case ISD::SETULT: {
17248 // If the comparison is against a constant we can turn this into a
17249 // setule. With psubus, setule does not require a swap. This is
17250 // beneficial because the constant in the register is no longer
17251 // destructed as the destination so it can be hoisted out of a loop.
17252 // Only do this pre-AVX since vpcmp* is no longer destructive.
17253 if (Subtarget.hasAVX())
17255 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17257 Subus = true; Invert = false; Swap = false;
17261 // Psubus is better than flip-sign because it requires no inversion.
17262 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17263 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17267 Opc = X86ISD::SUBUS;
17273 std::swap(Op0, Op1);
17275 // Check that the operation in question is available (most are plain SSE2,
17276 // but PCMPGTQ and PCMPEQQ have different requirements).
17277 if (VT == MVT::v2i64) {
17278 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17279 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17281 // First cast everything to the right type.
17282 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17283 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17285 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17286 // bits of the inputs before performing those operations. The lower
17287 // compare is always unsigned.
17290 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17292 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17293 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17294 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17296 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17297 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17299 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17300 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17301 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17303 // Create masks for only the low parts/high parts of the 64 bit integers.
17304 static const int MaskHi[] = { 1, 1, 3, 3 };
17305 static const int MaskLo[] = { 0, 0, 2, 2 };
17306 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17307 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17308 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17310 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17311 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17314 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17316 return DAG.getBitcast(VT, Result);
17319 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17320 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17321 // pcmpeqd + pshufd + pand.
17322 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17324 // First cast everything to the right type.
17325 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17326 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17329 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17331 // Make sure the lower and upper halves are both all-ones.
17332 static const int Mask[] = { 1, 0, 3, 2 };
17333 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17334 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17337 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17339 return DAG.getBitcast(VT, Result);
17343 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17344 // bits of the inputs before performing those operations.
17346 MVT EltVT = VT.getVectorElementType();
17347 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17349 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17350 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17353 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17355 // If the logical-not of the result is required, perform that now.
17357 Result = DAG.getNOT(dl, Result, VT);
17360 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17363 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17364 getZeroVector(VT, Subtarget, DAG, dl));
// Lower a scalar ISD::SETCC. Vector compares are forwarded to LowerVSETCC;
// scalar ones are first matched against the BT (bit-test) patterns, then
// simplified for constant 0/1 RHS, and finally emitted as an X86 compare
// (EmitCmp) feeding an X86ISD::SETCC with the translated condition code.
17369 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17371 MVT VT = Op.getSimpleValueType();
17373 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17375 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
17376 && "SetCC type must be 8-bit or 1-bit integer");
17377 SDValue Op0 = Op.getOperand(0);
17378 SDValue Op1 = Op.getOperand(1);
17380 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17382 // Optimize to BT if possible.
17383 // Lower (X & (1 << N)) == 0 to BT(X, N).
17384 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17385 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17386 // Lower (trunc (X >> N) to i1) to BT(X, N).
17387 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17388 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17389 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
// If an i1 result is required, the BT-based setcc is truncated down.
17391 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17396 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17398 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17399 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17401 // If the input is a setcc, then reuse the input setcc or use a new one with
17402 // the inverted condition.
17403 if (Op0.getOpcode() == X86ISD::SETCC) {
17404 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
// Invert exactly when (SETNE) XOR (comparing against 0) — the two
// negations cancel for "setcc != 0" and "setcc == 1".
17405 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17409 CCode = X86::GetOppositeBranchCondition(CCode);
17410 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17412 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
// For i1 operands, rewrite "== 1" as the inverted compare against 0, and
// any other non-zero constant compare as (xor Op0, Op1) compared to 0.
17416 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17417 if (isOneConstant(Op1)) {
17418 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17419 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17421 if (!isNullConstant(Op1)) {
17422 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17423 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
// Generic path: translate the ISD condition code to an X86 one and emit
// CMP + SETCC on EFLAGS. COND_INVALID means the combination (e.g. some FP
// predicates) needs multiple compares and is handled elsewhere.
17427 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17428 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17429 if (X86CC == X86::COND_INVALID)
17432 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17433 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17434 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17436 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
// Lower ISD::SETCCE (setcc-with-carry-in, used for wide compares split into
// legal-width pieces): emit X86ISD::SBB to consume the incoming carry and
// produce EFLAGS, then read the flags with a SETCC of the translated
// integer condition.
17440 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
17441 SDValue LHS = Op.getOperand(0);
17442 SDValue RHS = Op.getOperand(1);
17443 SDValue Carry = Op.getOperand(2);
17444 SDValue Cond = Op.getOperand(3);
17447 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
17448 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
// A constant-false carry should have been folded away before reaching here.
17450 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
// SBB's second result (value 1, type i32) is the EFLAGS output consumed by
// the SETCC below; the arithmetic result itself is unused here.
17451 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17452 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
17453 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17454 if (Op.getSimpleValueType() == MVT::i1)
17455 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17459 /// Return true if opcode is a X86 logical comparison, i.e. a node whose
17459 /// (flag) result can be used directly as the condition input of CMOV/branch
17459 /// lowering without emitting a separate TEST/CMP.
17460 static bool isX86LogicalCmp(SDValue Op) {
17461 unsigned Opc = Op.getOpcode();
// Dedicated compare nodes: their only purpose is to set flags.
17462 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17463 Opc == X86ISD::SAHF)
// Arithmetic/logic nodes qualify only through their EFLAGS result
// (result number 1); result 0 is the ordinary data value.
17465 if (Op.getResNo() == 1 &&
17466 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17467 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17468 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17469 Opc == X86ISD::XOR || Opc == X86ISD::AND))
// UMUL additionally exposes flags as its third result (result number 2).
17472 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
// Return true if V is a TRUNCATE whose source provably has all-zero bits
// above the truncated width, i.e. the truncate is a no-op value-wise and
// the wider source can be used in its place.
17478 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17479 if (V.getOpcode() != ISD::TRUNCATE)
17482 SDValue VOp0 = V.getOperand(0);
17483 unsigned InBits = VOp0.getValueSizeInBits();
17484 unsigned Bits = V.getValueSizeInBits();
// Ask known-bits analysis whether the (InBits - Bits) high bits are zero.
17485 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
// Lower ISD::SELECT into X86ISD::CMOV (or FP/mask-specific forms). Handles,
// in order: scalar-FP selects via FSETCC/VBLENDV, AVX-512 mask selects,
// vXi1 selects, selects fed by a SETCC condition, SBB-based all-ones/zero
// idioms, overflow-flag conditions, and finally the generic CMOV emission.
17488 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17489 bool AddTest = true;
17490 SDValue Cond = Op.getOperand(0);
17491 SDValue Op1 = Op.getOperand(1);
17492 SDValue Op2 = Op.getOperand(2);
17494 MVT VT = Op1.getSimpleValueType();
17497 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17498 // are available or VBLENDV if AVX is available.
17499 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17500 if (Cond.getOpcode() == ISD::SETCC &&
17501 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17502 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17503 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17504 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17505 int SSECC = translateX86FSETCC(
17506 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
// AVX-512: compare into a mask register and select with SELECTS.
17509 if (Subtarget.hasAVX512()) {
17510 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
17511 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17512 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17513 DL, VT, Cmp, Op1, Op2);
17516 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17517 DAG.getConstant(SSECC, DL, MVT::i8));
17519 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17520 // of 3 logic instructions for size savings and potentially speed.
17521 // Unfortunately, there is no scalar form of VBLENDV.
17523 // If either operand is a constant, don't try this. We can expect to
17524 // optimize away at least one of the logic instructions later in that
17525 // case, so that sequence would be faster than a variable blend.
17527 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17528 // uses XMM0 as the selection register. That may need just as many
17529 // instructions as the AND/ANDN/OR sequence due to register moves, so
17532 if (Subtarget.hasAVX() &&
17533 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17535 // Convert to vectors, do a VSELECT, and convert back to scalar.
17536 // All of the conversions should be optimized away.
17538 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17539 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17540 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17541 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17543 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17544 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17546 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
17548 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17549 VSel, DAG.getIntPtrConstant(0, DL));
// Pre-AVX fallback: classic (Cmp & Op1) | (~Cmp & Op2) bit-select.
17551 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17552 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17553 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17557 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17558 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
17559 Subtarget.hasAVX512())
17560 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
// vXi1 select: if both operands can be viewed as scalar integers (constant
// build-vectors or bitcasts from scalars), do a scalar SELECT and cast back.
17562 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17564 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17565 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17566 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17567 Op1Scalar = Op1.getOperand(0);
17569 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17570 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17571 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17572 Op2Scalar = Op2.getOperand(0);
17573 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17574 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
17575 Op1Scalar.getValueType(),
17576 Cond, Op1Scalar, Op2Scalar);
17577 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17578 return DAG.getBitcast(VT, newSelect);
17579 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17581 DAG.getIntPtrConstant(0, DL));
// Narrow i1 vectors: widen to v8i1, select, then extract the low part.
17585 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17586 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17587 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17588 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17589 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17590 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17591 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
17593 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17596 if (Cond.getOpcode() == ISD::SETCC) {
17597 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17599 // If the condition was updated, it's possible that the operands of the
17600 // select were also updated (for example, EmitTest has a RAUW). Refresh
17601 // the local references to the select operands in case they got stale.
17602 Op1 = Op.getOperand(1);
17603 Op2 = Op.getOperand(2);
17607 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17608 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17609 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17610 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17611 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17612 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17613 if (Cond.getOpcode() == X86ISD::SETCC &&
17614 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17615 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17616 SDValue Cmp = Cond.getOperand(1);
17617 unsigned CondCode =
17618 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17620 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17621 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17622 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17624 SDValue CmpOp0 = Cmp.getOperand(0);
17625 // Apply further optimizations for special cases
17626 // (select (x != 0), -1, 0) -> neg & sbb
17627 // (select (x == 0), 0, -1) -> neg & sbb
17628 if (isNullConstant(Y) &&
17629 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
// NEG x sets the carry flag iff x != 0; SETCC_CARRY (SBB r,r)
// materializes all-ones from carry, giving -1/0 in one sequence.
17630 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17631 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17632 DAG.getConstant(0, DL,
17633 CmpOp0.getValueType()),
17635 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17636 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17637 SDValue(Neg.getNode(), 1));
// General form: compare x against 1 so borrow (CF) encodes x == 0,
// then broadcast the carry with SETCC_CARRY.
17641 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17642 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17643 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17645 SDValue Res = // Res = 0 or -1.
17646 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17647 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17649 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17650 Res = DAG.getNOT(DL, Res, Res.getValueType());
17652 if (!isNullConstant(Op2))
17653 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
// No-CMOV targets: turn select on (x & 1) == 0 into mask arithmetic
// when the false arm is (z ^ y) or (z | y) with y as the true arm.
17655 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17656 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17657 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17658 SDValue CmpOp0 = Cmp.getOperand(0);
17659 SDValue Src1, Src2;
17660 // true if Op2 is XOR or OR operator and one of its operands
17662 // ( a , a op b) || ( b , a op b)
17663 auto isOrXorPattern = [&]() {
17664 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17665 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17667 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17674 if (isOrXorPattern()) {
17676 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17677 // we need mask of all zeros or ones with same size of the other
17679 if (CmpSz > VT.getSizeInBits())
17680 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17681 else if (CmpSz < VT.getSizeInBits())
17682 Neg = DAG.getNode(ISD::AND, DL, VT,
17683 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17684 DAG.getConstant(1, DL, VT));
17687 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17688 Neg); // -(and (x, 0x1))
17689 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17690 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17695 // Look past (and (setcc_carry (cmp ...)), 1).
17696 if (Cond.getOpcode() == ISD::AND &&
17697 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17698 isOneConstant(Cond.getOperand(1)))
17699 Cond = Cond.getOperand(0);
17701 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17702 // setting operand in place of the X86ISD::SETCC.
17703 unsigned CondOpcode = Cond.getOpcode();
17704 if (CondOpcode == X86ISD::SETCC ||
17705 CondOpcode == X86ISD::SETCC_CARRY) {
17706 CC = Cond.getOperand(0);
17708 SDValue Cmp = Cond.getOperand(1);
17709 unsigned Opc = Cmp.getOpcode();
17710 MVT VT = Op.getSimpleValueType();
17712 bool IllegalFPCMov = false;
17713 if (VT.isFloatingPoint() && !VT.isVector() &&
17714 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17715 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17717 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17718 Opc == X86ISD::BT) { // FIXME
// Overflow-producing conditions: re-emit the arithmetic as the matching
// flag-setting X86 node and branch on the overflow/carry flag directly.
17722 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17723 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17724 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17725 Cond.getOperand(0).getValueType() != MVT::i8)) {
17726 SDValue LHS = Cond.getOperand(0);
17727 SDValue RHS = Cond.getOperand(1);
17728 unsigned X86Opcode;
17731 switch (CondOpcode) {
17732 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17733 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17734 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17735 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17736 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17737 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17738 default: llvm_unreachable("unexpected overflowing operator");
17740 if (CondOpcode == ISD::UMULO)
17741 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17744 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17746 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
// UMUL exposes its flags as the third result; the others as the second.
17748 if (CondOpcode == ISD::UMULO)
17749 Cond = X86Op.getValue(2);
17751 Cond = X86Op.getValue(1);
17753 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17758 // Look past the truncate if the high bits are known zero.
17759 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17760 Cond = Cond.getOperand(0);
17762 // We know the result of AND is compared against zero. Try to match
17764 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17765 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17766 CC = NewSetCC.getOperand(0);
17767 Cond = NewSetCC.getOperand(1);
// Fall-through: no reusable flags were found, so emit an explicit TEST
// of the condition against zero.
17774 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17775 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17778 // a <  b ? -1 :  0 -> RES = ~setcc_carry
17779 // a <  b ?  0 : -1 -> RES = setcc_carry
17780 // a >= b ? -1 :  0 -> RES = setcc_carry
17781 // a >= b ?  0 : -1 -> RES = ~setcc_carry
17782 if (Cond.getOpcode() == X86ISD::SUB) {
17783 Cond = ConvertCmpIfNecessary(Cond, DAG);
17784 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17786 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17787 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17788 (isNullConstant(Op1) || isNullConstant(Op2))) {
17789 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17790 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17792 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17793 return DAG.getNOT(DL, Res, Res.getValueType());
17798 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17799 // widen the cmov and push the truncate through. This avoids introducing a new
17800 // branch during isel and doesn't add any extensions.
17801 if (Op.getValueType() == MVT::i8 &&
17802 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17803 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17804 if (T1.getValueType() == T2.getValueType() &&
17805 // Blacklist CopyFromReg to avoid partial register stalls.
17806 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17807 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17808 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17809 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17813 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17814 // condition is true.
17815 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17816 SDValue Ops[] = { Op2, Op1, CC, Cond };
17817 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
// Lower SIGN_EXTEND for AVX-512: handles extensions from vXi1 mask vectors
// (via VSEXT when the subtarget has the right mask-extend support, otherwise
// via VSELECT between all-ones and zero) and 512-bit integer extensions.
17820 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17821 const X86Subtarget &Subtarget,
17822 SelectionDAG &DAG) {
17823 MVT VT = Op->getSimpleValueType(0);
17824 SDValue In = Op->getOperand(0);
17825 MVT InVT = In.getSimpleValueType();
17826 MVT VTElt = VT.getVectorElementType();
17827 MVT InVTElt = InVT.getVectorElementType();
// i1 sources extend directly with VSEXT when BWI covers <=16-bit elements
// or DQI covers >=32-bit elements.
17831 if ((InVTElt == MVT::i1) &&
17832 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17834 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17836 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17838 unsigned NumElts = VT.getVectorNumElements();
// 512-bit non-i1 extends: fold a nested VSEXT/VZEXT, else emit VSEXT.
17840 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17841 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17842 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17843 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17844 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17847 if (InVTElt != MVT::i1)
// Remaining i1 cases: widen to a 512-bit-per-lane type when VLX is absent,
// extend there, then truncate back down to the requested type.
17851 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17852 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17855 if (Subtarget.hasDQI()) {
17856 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17857 assert(!VT.is512BitVector() && "Unexpected vector type");
// Without DQI, materialize the sign-extension as select(mask, -1, 0).
17859 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17860 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17861 V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17866 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17869 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17870 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17871 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17872 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17873 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17874 const X86Subtarget &Subtarget,
17875 SelectionDAG &DAG) {
17876 SDValue In = Op->getOperand(0);
17877 MVT VT = Op->getSimpleValueType(0);
17878 MVT InVT = In.getSimpleValueType();
17879 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17881 MVT SVT = VT.getVectorElementType();
17882 MVT InSVT = InVT.getVectorElementType();
17883 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
// Bail out on element-type / subtarget combinations not handled here.
17885 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17887 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17889 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17890 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17891 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17896 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17897 // For 512-bit vectors, we need 128-bits or 256-bits.
17898 if (VT.getSizeInBits() > 128) {
17899 // Input needs to be at least the same number of elements as output, and
17900 // at least 128-bits.
17901 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17902 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17905 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17906 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17908 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17909 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17910 // need to be handled here for 256/512-bit results.
17911 if (Subtarget.hasInt256()) {
17912 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17913 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17914 X86ISD::VSEXT : X86ISD::VZEXT;
17915 return DAG.getNode(ExtOpc, dl, VT, In);
17918 // We should only get here for sign extend.
17919 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17920 "Unexpected opcode!");
17922 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17926 // As SRAI is only available on i16/i32 types, we expand only up to i32
17927 // and handle i64 separately.
// Repeatedly unpack with undef into the low lanes, doubling the element
// width each iteration until we reach the target width (or i32).
17928 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17929 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17930 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17931 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17932 Curr = DAG.getBitcast(CurrVT, Curr);
17935 SDValue SignExt = Curr;
17936 if (CurrVT != InVT) {
// The unpack left the payload in the high half of each element; shift it
// arithmetically back down to replicate the sign bits.
17937 unsigned SignExtShift =
17938 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17939 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17940 DAG.getConstant(SignExtShift, dl, MVT::i8));
// i64 result: build the high 32 bits from an all-sign-bits copy and
// interleave low/high pairs back together.
17946 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17947 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17948 DAG.getConstant(31, dl, MVT::i8));
17949 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17950 return DAG.getBitcast(VT, Ext);
// Lower ISD::SIGN_EXTEND for vectors. 512-bit and i1-source extends go to
// the AVX-512 path; the AVX (non-Int256) cases split the input in half,
// sign-extend each half in-register, and concatenate the results.
17956 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17957 SelectionDAG &DAG) {
17958 MVT VT = Op->getSimpleValueType(0);
17959 SDValue In = Op->getOperand(0);
17960 MVT InVT = In.getSimpleValueType();
17963 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17964 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
// Only the 256-bit doubling extensions are custom-lowered here.
17966 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17967 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17968 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17971 if (Subtarget.hasInt256())
17972 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17974 // Optimize vectors in AVX mode
17975 // Sign extend v8i16 to v8i32 and
17978 // Divide input vector into two parts
17979 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17980 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17981 // concat the vectors to original VT
17983 unsigned NumElems = InVT.getVectorNumElements();
17984 SDValue Undef = DAG.getUNDEF(InVT);
// First mask selects the low half of the elements (rest undef).
17986 SmallVector<int,8> ShufMask1(NumElems, -1);
17987 for (unsigned i = 0; i != NumElems/2; ++i)
17990 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
// Second mask moves the high half of the elements down to the bottom.
17992 SmallVector<int,8> ShufMask2(NumElems, -1);
17993 for (unsigned i = 0; i != NumElems/2; ++i)
17994 ShufMask2[i] = i + NumElems/2;
17996 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17998 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17999 VT.getVectorNumElements() / 2);
18001 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18002 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18004 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18007 // Lower truncating store. We need a special lowering to vXi1 vectors
18008 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18009 SelectionDAG &DAG) {
18010 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18012 EVT MemVT = St->getMemoryVT();
18013 assert(St->isTruncatingStore() && "We only custom truncating store.");
18014 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18015 "Expected truncstore of i1 vector");
18017 SDValue Op = St->getValue();
18018 MVT OpVT = Op.getValueType().getSimpleVT();
18019 unsigned NumElts = OpVT.getVectorNumElements();
// Full mask-register support: truncate to the i1 vector and store it,
// widening sub-byte masks to v8i1 first so the store is byte-sized.
18020 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18022 // Truncate and store - everything is legal
18023 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18024 if (MemVT.getSizeInBits() < 8)
18025 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18026 DAG.getUNDEF(MVT::v8i1), Op,
18027 DAG.getIntPtrConstant(0, dl));
18028 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18029 St->getMemOperand());
18032 // A subset, assume that we have only AVX-512F
// Up to 8 elements: widen the source to an 8-element vector, truncate to
// v8i1 and store one byte.
18033 if (NumElts <= 8) {
18035 // Extend to 8-elts vector
18036 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18037 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18038 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18040 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18041 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18042 St->getMemOperand());
18045 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18046 // Divide the vector into 2 parts and store each part separately
18047 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18048 DAG.getIntPtrConstant(0, dl));
18049 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18050 SDValue BasePtr = St->getBasePtr();
18051 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18052 St->getMemOperand());
18053 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18054 DAG.getIntPtrConstant(16, dl));
18055 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
// The high 16 mask bits live 2 bytes past the base (16 i1 bits = 2 bytes).
18057 SDValue BasePtrHi =
18058 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18059 DAG.getConstant(2, dl, BasePtr.getValueType()));
18061 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18062 BasePtrHi, St->getMemOperand());
// Join the two store chains so later memory ops order after both.
18063 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
// Lower an extending load of a vXi1 vector: load the mask bits and then
// sign- or zero-extend (per the load's extension type) to the result type.
// The strategy depends on how much mask-register support the subtarget has.
18066 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18067 const X86Subtarget &Subtarget,
18068 SelectionDAG &DAG) {
18070 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18072 EVT MemVT = Ld->getMemoryVT();
18073 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18074 "Expected i1 vector load");
18075 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18076 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18077 MVT VT = Op.getValueType().getSimpleVT();
18078 unsigned NumElts = VT.getVectorNumElements();
// Mask loads are legal for these subtarget/width combinations: load the
// i1 vector directly and extend it.
18080 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18081 (Subtarget.hasDQI() && NumElts < 16) ||
18083 // Load and extend - everything is legal
18085 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18087 Ld->getMemOperand());
18088 // Replace chain users with the new chain.
18089 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18090 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18091 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18092 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
// The extended v8 result is wider than needed; keep only the low part.
18094 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18095 DAG.getIntPtrConstant(0, dl));
18097 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18099 Ld->getMemOperand());
18100 // Replace chain users with the new chain.
18101 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18102 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18104 // Finally, do a normal sign-extend to the desired register.
18105 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
// AVX-512F-only path for <=8 elements: load the mask as a whole byte,
// bitcast it to v8i1, and extend from there.
18108 if (NumElts <= 8) {
18109 // A subset, assume that we have only AVX-512F
18110 unsigned NumBitsToLoad = 8;
18111 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18112 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18114 Ld->getMemOperand());
18115 // Replace chain users with the new chain.
18116 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18117 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18119 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18120 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18123 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18125 // we should take care to v4i1 and v2i1
18127 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18128 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18130 DAG.getIntPtrConstant(0, dl));
// v32i8 result: load the two v16i1 halves separately (the high half sits
// 2 bytes past the base pointer), extend each, and concatenate.
18133 assert(VT == MVT::v32i8 && "Unexpected extload type");
18135 SmallVector<SDValue, 2> Chains;
18137 SDValue BasePtr = Ld->getBasePtr();
18138 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18140 Ld->getMemOperand());
18141 Chains.push_back(LoadLo.getValue(1));
18143 SDValue BasePtrHi =
18144 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18145 DAG.getConstant(2, dl, BasePtr.getValueType()));
18147 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18149 Ld->getMemOperand());
18150 Chains.push_back(LoadHi.getValue(1));
18151 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18152 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18154 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18155 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18156 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18159 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18160 // may emit an illegal shuffle but the expansion is still better than scalar
18161 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18162 // we'll emit a shuffle and an arithmetic shift.
18163 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18164 // TODO: It is possible to support ZExt by zeroing the undef values during
18165 // the shuffle phase or after the shuffle.
// NOTE(review): this excerpt is missing some interleaved source lines (e.g. the
// declarations of 'dl', 'Load' and 'HalfEltVT', and several closing braces);
// the comments below describe only the logic that is visible here.
18166 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18167 SelectionDAG &DAG) {
18168 MVT RegVT = Op.getSimpleValueType();
18169 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18170 assert(RegVT.isInteger() &&
18171 "We only custom lower integer vector sext loads.");
18173 // Nothing useful we can do without SSE2 shuffles.
18174 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18176 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
// i1-element (AVX-512 mask vector) loads take a dedicated lowering path.
18178 EVT MemVT = Ld->getMemoryVT();
18179 if (MemVT.getScalarType() == MVT::i1)
18180 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18183 unsigned RegSz = RegVT.getSizeInBits();
18185 ISD::LoadExtType Ext = Ld->getExtensionType();
18187 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18188 && "Only anyext and sext are currently implemented.");
18189 assert(MemVT != RegVT && "Cannot extend to the same type");
18190 assert(MemVT.isVector() && "Must load a vector from memory");
18192 unsigned NumElems = RegVT.getVectorNumElements();
18193 unsigned MemSz = MemVT.getSizeInBits();
18194 assert(RegSz > MemSz && "Register size must be greater than the mem size");
// AVX1-without-AVX2 256-bit sextload: lower via a 128-bit load plus a
// sign_extend that the legalizer knows how to handle.
18196 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18197 // The only way in which we have a legal 256-bit vector result but not the
18198 // integer 256-bit operations needed to directly lower a sextload is if we
18199 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18200 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18201 // correctly legalized. We do this late to allow the canonical form of
18202 // sextload to persist throughout the rest of the DAG combiner -- it wants
18203 // to fold together any extensions it can, and so will fuse a sign_extend
18204 // of an sextload into a sextload targeting a wider value.
18206 if (MemSz == 128) {
18207 // Just switch this to a normal load.
18208 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18209 "it must be a legal 128-bit vector "
18211 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18212 Ld->getPointerInfo(), Ld->getAlignment(),
18213 Ld->getMemOperand()->getFlags());
18215 assert(MemSz < 128 &&
18216 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18217 // Do an sext load to a 128-bit vector type. We want to use the same
18218 // number of elements, but elements half as wide. This will end up being
18219 // recursively lowered by this routine, but will succeed as we definitely
18220 // have all the necessary features if we're using AVX1.
18222 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18223 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18225 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18226 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18227 Ld->getMemOperand()->getFlags());
18230 // Replace chain users with the new chain.
18231 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18232 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18234 // Finally, do a normal sign-extend to the desired register.
18235 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18238 // All sizes must be a power of two.
18239 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18240 "Non-power-of-two elements are not custom lowered!");
18242 // Attempt to load the original value using scalar loads.
18243 // Find the largest scalar type that divides the total loaded size.
18244 MVT SclrLoadTy = MVT::i8;
18245 for (MVT Tp : MVT::integer_valuetypes()) {
18246 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18251 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18252 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18254 SclrLoadTy = MVT::f64;
18256 // Calculate the number of scalar loads that we need to perform
18257 // in order to load our vector from memory.
18258 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18260 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18261 "Can only lower sext loads with a single scalar load!");
18263 unsigned loadRegZize = RegSz;
18264 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18267 // Represent our vector as a sequence of elements which are the
18268 // largest scalar that we can load.
18269 EVT LoadUnitVecVT = EVT::getVectorVT(
18270 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18272 // Represent the data using the same element type that is stored in
18273 // memory. In practice, we 'widen' MemVT.
18275 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18276 loadRegZize / MemVT.getScalarSizeInBits());
18278 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18279 "Invalid vector type");
18281 // We can't shuffle using an illegal type.
18282 assert(TLI.isTypeLegal(WideVecVT) &&
18283 "We only lower types that form legal widened vector types");
// Emit NumLoads scalar loads, inserting each into a growing vector and
// collecting the output chains so they can be token-factored together.
18285 SmallVector<SDValue, 8> Chains;
18286 SDValue Ptr = Ld->getBasePtr();
18287 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18288 TLI.getPointerTy(DAG.getDataLayout()));
18289 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18291 for (unsigned i = 0; i < NumLoads; ++i) {
18292 // Perform a single load.
18293 SDValue ScalarLoad =
18294 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18295 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18296 Chains.push_back(ScalarLoad.getValue(1));
18297 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18298 // another round of DAGCombining.
18300 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18302 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18303 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18305 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18308 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18310 // Bitcast the loaded value to a vector of the original element type, in
18311 // the size of the target vector type.
18312 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18313 unsigned SizeRatio = RegSz / MemSz;
18315 if (Ext == ISD::SEXTLOAD) {
18316 // If we have SSE4.1, we can directly emit a VSEXT node.
18317 if (Subtarget.hasSSE41()) {
18318 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18319 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18323 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18325 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18326 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18328 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18329 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18333 // Redistribute the loaded elements into the different locations.
// -1 mask entries leave the gap lanes undef; only every SizeRatio-th lane
// receives an element of the loaded (narrow) vector.
18334 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18335 for (unsigned i = 0; i != NumElems; ++i)
18336 ShuffleVec[i * SizeRatio] = i;
18338 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18339 DAG.getUNDEF(WideVecVT), ShuffleVec);
18341 // Bitcast to the requested type.
18342 Shuff = DAG.getBitcast(RegVT, Shuff);
18343 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18347 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18348 /// each of which has no other use apart from the AND / OR.
18349 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18350 Opc = Op.getOpcode();
18351 if (Opc != ISD::OR && Opc != ISD::AND)
18353 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18354 Op.getOperand(0).hasOneUse() &&
18355 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18356 Op.getOperand(1).hasOneUse());
18359 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18360 /// SETCC node has a single use.
18361 static bool isXor1OfSetCC(SDValue Op) {
18362 if (Op.getOpcode() != ISD::XOR)
18364 if (isOneConstant(Op.getOperand(1)))
18365 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18366 Op.getOperand(0).hasOneUse();
/// Lower ISD::BRCOND. Tries to feed the branch directly from EFLAGS by
/// recognising SETCC, overflow-arithmetic ([SU]{ADD,SUB,MUL}O) and
/// AND/OR/XOR-of-SETCC patterns, falling back to an explicit test of the
/// condition value when nothing matches.
// NOTE(review): this excerpt omits some interleaved source lines (e.g. the
// declarations of 'dl' and 'CC', several case labels and closing braces);
// the comments below describe only the logic that is visible here.
18370 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18371 bool addTest = true;
18372 SDValue Chain = Op.getOperand(0);
18373 SDValue Cond = Op.getOperand(1);
18374 SDValue Dest = Op.getOperand(2);
18377 bool Inverted = false;
18379 if (Cond.getOpcode() == ISD::SETCC) {
18380 // Check for setcc([su]{add,sub,mul}o == 0).
18381 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18382 isNullConstant(Cond.getOperand(1)) &&
18383 Cond.getOperand(0).getResNo() == 1 &&
18384 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18385 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18386 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18387 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18388 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18389 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18391 Cond = Cond.getOperand(0);
18393 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18398 // FIXME: LowerXALUO doesn't handle these!!
18399 else if (Cond.getOpcode() == X86ISD::ADD ||
18400 Cond.getOpcode() == X86ISD::SUB ||
18401 Cond.getOpcode() == X86ISD::SMUL ||
18402 Cond.getOpcode() == X86ISD::UMUL)
18403 Cond = LowerXALUO(Cond, DAG);
18406 // Look past (and (setcc_carry (cmp ...)), 1).
18407 if (Cond.getOpcode() == ISD::AND &&
18408 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18409 isOneConstant(Cond.getOperand(1)))
18410 Cond = Cond.getOperand(0);
18412 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18413 // setting operand in place of the X86ISD::SETCC.
18414 unsigned CondOpcode = Cond.getOpcode();
18415 if (CondOpcode == X86ISD::SETCC ||
18416 CondOpcode == X86ISD::SETCC_CARRY) {
18417 CC = Cond.getOperand(0);
18419 SDValue Cmp = Cond.getOperand(1);
18420 unsigned Opc = Cmp.getOpcode();
18421 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18422 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18426 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18430 // These can only come from an arithmetic instruction with overflow,
18431 // e.g. SADDO, UADDO.
18432 Cond = Cond.getOperand(1);
// Lower an overflow-producing arithmetic node directly to the X86 node that
// sets EFLAGS, so the branch can test the overflow/carry flag.
18438 CondOpcode = Cond.getOpcode();
18439 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18440 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18441 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18442 Cond.getOperand(0).getValueType() != MVT::i8)) {
18443 SDValue LHS = Cond.getOperand(0);
18444 SDValue RHS = Cond.getOperand(1);
18445 unsigned X86Opcode;
18448 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18449 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18451 switch (CondOpcode) {
18452 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18454 if (isOneConstant(RHS)) {
18455 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18458 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18459 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18461 if (isOneConstant(RHS)) {
18462 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18465 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18466 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18467 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18468 default: llvm_unreachable("unexpected overflowing operator");
18471 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
// UMULO's second result (the high part) means the flag result is value #2;
// for every other op the flag result is value #1.
18472 if (CondOpcode == ISD::UMULO)
18473 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18476 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18478 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18480 if (CondOpcode == ISD::UMULO)
18481 Cond = X86Op.getValue(2);
18483 Cond = X86Op.getValue(1);
18485 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18489 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18490 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18491 if (CondOpc == ISD::OR) {
18492 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18493 // two branches instead of an explicit OR instruction with a
18495 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18496 isX86LogicalCmp(Cmp)) {
18497 CC = Cond.getOperand(0).getOperand(0);
18498 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18499 Chain, Dest, CC, Cmp);
18500 CC = Cond.getOperand(1).getOperand(0);
18504 } else { // ISD::AND
18505 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18506 // two branches instead of an explicit AND instruction with a
18507 // separate test. However, we only do this if this block doesn't
18508 // have a fall-through edge, because this requires an explicit
18509 // jmp when the condition is false.
18510 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18511 isX86LogicalCmp(Cmp) &&
18512 Op.getNode()->hasOneUse()) {
18513 X86::CondCode CCode =
18514 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18515 CCode = X86::GetOppositeBranchCondition(CCode);
18516 CC = DAG.getConstant(CCode, dl, MVT::i8);
18517 SDNode *User = *Op.getNode()->use_begin();
18518 // Look for an unconditional branch following this conditional branch.
18519 // We need this because we need to reverse the successors in order
18520 // to implement FCMP_OEQ.
18521 if (User->getOpcode() == ISD::BR) {
18522 SDValue FalseBB = User->getOperand(1);
18524 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18525 assert(NewBR == User);
18529 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18530 Chain, Dest, CC, Cmp);
18531 X86::CondCode CCode =
18532 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18533 CCode = X86::GetOppositeBranchCondition(CCode);
18534 CC = DAG.getConstant(CCode, dl, MVT::i8);
18540 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18541 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
18542 // It should be transformed during dag combiner except when the condition
18543 // is set by an arithmetic-with-overflow node.
18544 X86::CondCode CCode =
18545 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18546 CCode = X86::GetOppositeBranchCondition(CCode);
18547 CC = DAG.getConstant(CCode, dl, MVT::i8);
18548 Cond = Cond.getOperand(0).getOperand(1);
18550 } else if (Cond.getOpcode() == ISD::SETCC &&
18551 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18552 // For FCMP_OEQ, we can emit
18553 // two branches instead of an explicit AND instruction with a
18554 // separate test. However, we only do this if this block doesn't
18555 // have a fall-through edge, because this requires an explicit
18556 // jmp when the condition is false.
18557 if (Op.getNode()->hasOneUse()) {
18558 SDNode *User = *Op.getNode()->use_begin();
18559 // Look for an unconditional branch following this conditional branch.
18560 // We need this because we need to reverse the successors in order
18561 // to implement FCMP_OEQ.
18562 if (User->getOpcode() == ISD::BR) {
18563 SDValue FalseBB = User->getOperand(1);
18565 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18566 assert(NewBR == User);
// OEQ == (!NE && !P): branch on NE then on P, both to the false block.
18570 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18571 Cond.getOperand(0), Cond.getOperand(1));
18572 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18573 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18574 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18575 Chain, Dest, CC, Cmp);
18576 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18581 } else if (Cond.getOpcode() == ISD::SETCC &&
18582 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18583 // For FCMP_UNE, we can emit
18584 // two branches instead of an explicit AND instruction with a
18585 // separate test. However, we only do this if this block doesn't
18586 // have a fall-through edge, because this requires an explicit
18587 // jmp when the condition is false.
18588 if (Op.getNode()->hasOneUse()) {
18589 SDNode *User = *Op.getNode()->use_begin();
18590 // Look for an unconditional branch following this conditional branch.
18591 // We need this because we need to reverse the successors in order
18592 // to implement FCMP_UNE.
18593 if (User->getOpcode() == ISD::BR) {
18594 SDValue FalseBB = User->getOperand(1);
18596 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18597 assert(NewBR == User);
// UNE == (NE || P): branch on NE then on NP, mirroring the OEQ case above.
18600 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18601 Cond.getOperand(0), Cond.getOperand(1));
18602 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18603 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18604 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18605 Chain, Dest, CC, Cmp);
18606 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18616 // Look past the truncate if the high bits are known zero.
18617 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18618 Cond = Cond.getOperand(0);
18620 // We know the result is compared against zero. Try to match it to BT.
18621 if (Cond.hasOneUse()) {
18622 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18623 CC = NewSetCC.getOperand(0);
18624 Cond = NewSetCC.getOperand(1);
// Fallback: explicitly test the condition value and branch on (in)equality
// with zero, inverting the condition if an earlier pattern requested it.
18631 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18632 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18633 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18635 Cond = ConvertCmpIfNecessary(Cond, DAG);
18636 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18637 Chain, Dest, CC, Cond);
18640 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18641 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18642 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18643 // that the guard pages used by the OS virtual memory manager are allocated in
18644 // correct sequence.
// NOTE(review): this excerpt omits some interleaved source lines (e.g. the
// 'SDValue' return type line, the declarations of 'dl' and 'Result', and
// several else/brace lines); comments below describe only the visible logic.
18646 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18647 SelectionDAG &DAG) const {
18648 MachineFunction &MF = DAG.getMachineFunction();
18649 bool SplitStack = MF.shouldSplitStack();
// Windows (non-MachO) targets must probe the stack via _alloca/WIN_ALLOCA.
18650 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18655 SDNode *Node = Op.getNode();
18656 SDValue Chain = Op.getOperand(0);
18657 SDValue Size = Op.getOperand(1);
18658 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18659 EVT VT = Node->getValueType(0);
18661 // Chain the dynamic stack allocation so that it doesn't modify the stack
18662 // pointer when other instructions are using the stack.
18663 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18665 bool Is64Bit = Subtarget.is64Bit();
18666 MVT SPTy = getPointerTy(DAG.getDataLayout());
// Generic path: adjust SP directly (SP -= Size, then over-align if needed).
18670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18671 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18672 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18673 " not tell us which reg is the stack pointer!");
18675 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18676 Chain = SP.getValue(1);
18677 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18678 unsigned StackAlign = TFI.getStackAlignment();
18679 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18680 if (Align > StackAlign)
18681 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18682 DAG.getConstant(-(uint64_t)Align, dl, VT));
18683 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18684 } else if (SplitStack) {
18685 MachineRegisterInfo &MRI = MF.getRegInfo();
18688 // The 64 bit implementation of segmented stacks needs to clobber both r10
18689 // r11. This makes it impossible to use it along with nested parameters.
18690 const Function *F = MF.getFunction();
18691 for (const auto &A : F->args()) {
18692 if (A.hasNestAttr())
18693 report_fatal_error("Cannot use segmented stacks with functions that "
18694 "have nested arguments.");
// Segmented-stack path: the SEG_ALLOCA pseudo expands to the morestack probe.
18698 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18699 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18700 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18701 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18702 DAG.getRegister(Vreg, SPTy));
// Windows path: WIN_ALLOCA probes the stack in page-sized increments.
18704 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18705 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18706 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18708 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18709 unsigned SPReg = RegInfo->getStackRegister();
18710 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18711 Chain = SP.getValue(1);
18714 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18715 DAG.getConstant(-(uint64_t)Align, dl, VT));
18716 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18722 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18723 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
// Return the allocated address (value 0) and the updated chain (value 1).
18725 SDValue Ops[2] = {Result, Chain};
18726 return DAG.getMergeValues(Ops, dl);
/// Lower ISD::VASTART. On 32-bit and Win64 targets the va_list is a single
/// pointer; on SysV x86-64 it is the four-field { gp_offset, fp_offset,
/// overflow_arg_area, reg_save_area } structure, initialised field by field.
// NOTE(review): this excerpt omits a few interleaved source lines (e.g. the
// declaration of 'DL'); comments below describe only the visible logic.
18729 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18730 MachineFunction &MF = DAG.getMachineFunction();
18731 auto PtrVT = getPointerTy(MF.getDataLayout());
18732 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18734 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18737 if (!Subtarget.is64Bit() ||
18738 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18739 // vastart just stores the address of the VarArgsFrameIndex slot into the
18740 // memory location argument.
18741 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18742 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18743 MachinePointerInfo(SV));
18747 // gp_offset (0 - 6 * 8)
18748 // fp_offset (48 - 48 + 8 * 16)
18749 // overflow_arg_area (point to parameters coming in memory).
18751 SmallVector<SDValue, 8> MemOps;
18752 SDValue FIN = Op.getOperand(1);
// Store gp_offset (i32 at offset 0 of the va_list).
18754 SDValue Store = DAG.getStore(
18755 Op.getOperand(0), DL,
18756 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18757 MachinePointerInfo(SV));
18758 MemOps.push_back(Store);
// Store fp_offset (i32 at offset 4).
18761 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18762 Store = DAG.getStore(
18763 Op.getOperand(0), DL,
18764 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18765 MachinePointerInfo(SV, 4));
18766 MemOps.push_back(Store);
18768 // Store ptr to overflow_arg_area
18769 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18770 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18772 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18773 MemOps.push_back(Store);
18775 // Store ptr to reg_save_area.
18776 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18777 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18778 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18779 Store = DAG.getStore(
18780 Op.getOperand(0), DL, RSFIN, FIN,
18781 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18782 MemOps.push_back(Store);
// Tie all four field stores together into one chain result.
18783 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
/// Lower ISD::VAARG for the SysV x86-64 ABI via the VAARG_64 pseudo, which
/// computes the next argument's address; Win64 falls back to the generic
/// char*-based expansion.
// NOTE(review): this excerpt omits a few interleaved source lines (e.g. the
// declarations of 'dl' and 'ArgMode', and some else/brace lines); comments
// below describe only the visible logic.
18786 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18787 assert(Subtarget.is64Bit() &&
18788 "LowerVAARG only handles 64-bit va_arg!");
18789 assert(Op.getNumOperands() == 4);
18791 MachineFunction &MF = DAG.getMachineFunction();
18792 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18793 // The Win64 ABI uses char* instead of a structure.
18794 return DAG.expandVAArg(Op.getNode());
18796 SDValue Chain = Op.getOperand(0);
18797 SDValue SrcPtr = Op.getOperand(1);
18798 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18799 unsigned Align = Op.getConstantOperandVal(3);
18802 EVT ArgVT = Op.getNode()->getValueType(0);
18803 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18804 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18807 // Decide which area this value should be read from.
18808 // TODO: Implement the AMD64 ABI in its entirety. This simple
18809 // selection mechanism works only for the basic types.
18810 if (ArgVT == MVT::f80) {
18811 llvm_unreachable("va_arg for f80 not yet implemented");
18812 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18813 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18814 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18815 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18817 llvm_unreachable("Unhandled argument type in LowerVAARG");
18820 if (ArgMode == 2) {
18821 // Sanity Check: Make sure using fp_offset makes sense.
18822 assert(!Subtarget.useSoftFloat() &&
18823 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18824 Subtarget.hasSSE1());
18827 // Insert VAARG_64 node into the DAG
18828 // VAARG_64 returns two values: Variable Argument Address, Chain
18829 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18830 DAG.getConstant(ArgMode, dl, MVT::i8),
18831 DAG.getConstant(Align, dl, MVT::i32)};
18832 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18833 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18834 VTs, InstOps, MVT::i64,
18835 MachinePointerInfo(SV),
18837 /*Volatile=*/false,
18839 /*WriteMem=*/true);
18840 Chain = VAARG.getValue(1);
18842 // Load the next argument and return it
18843 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
/// Lower ISD::VACOPY. On SysV x86-64 the 24-byte va_list struct is copied
/// with a memcpy; Win64 uses the generic pointer-copy expansion.
// NOTE(review): this excerpt omits a few interleaved source lines (e.g. the
// declaration of 'DL' and one memcpy argument line); comments below describe
// only the visible logic.
18846 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18847 SelectionDAG &DAG) {
18848 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18849 // where a va_list is still an i8*.
18850 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18851 if (Subtarget.isCallingConvWin64(
18852 DAG.getMachineFunction().getFunction()->getCallingConv()))
18853 // Probably a Win64 va_copy.
18854 return DAG.expandVACopy(Op.getNode());
18856 SDValue Chain = Op.getOperand(0);
18857 SDValue DstPtr = Op.getOperand(1);
18858 SDValue SrcPtr = Op.getOperand(2);
18859 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18860 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
// 24 bytes = sizeof the SysV va_list struct; 8-byte alignment.
18863 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18864 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18866 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18869 /// Handle vector element shifts where the shift amount is a constant.
18870 /// Takes immediate version of shift as input.
// NOTE(review): this excerpt omits a few interleaved source lines (e.g. the
// 'switch (Opc)' line for the constant-folding block and some brace/continue
// lines); comments below describe only the visible logic.
18871 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18872 SDValue SrcOp, uint64_t ShiftAmt,
18873 SelectionDAG &DAG) {
18874 MVT ElementType = VT.getVectorElementType();
18876 // Bitcast the source vector to the output type, this is mainly necessary for
18877 // vXi8/vXi64 shifts.
18878 if (VT != SrcOp.getSimpleValueType())
18879 SrcOp = DAG.getBitcast(VT, SrcOp);
18881 // Fold this packed shift into its first operand if ShiftAmt is 0.
18885 // Check for ShiftAmt >= element width
18886 if (ShiftAmt >= ElementType.getSizeInBits()) {
// Arithmetic right shift saturates at (width - 1) bits (replicates the sign
// bit); logical shifts by >= width produce zero.
18887 if (Opc == X86ISD::VSRAI)
18888 ShiftAmt = ElementType.getSizeInBits() - 1;
18890 return DAG.getConstant(0, dl, VT);
18893 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18894 && "Unknown target vector shift-by-constant node");
18896 // Fold this packed vector shift into a build vector if SrcOp is a
18897 // vector of Constants or UNDEFs.
18898 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18899 SmallVector<SDValue, 8> Elts;
18900 unsigned NumElts = SrcOp->getNumOperands();
18901 ConstantSDNode *ND;
18904 default: llvm_unreachable("Unknown opcode!");
18905 case X86ISD::VSHLI:
// Constant-fold each lane with a logical left shift; undef lanes pass through.
18906 for (unsigned i=0; i!=NumElts; ++i) {
18907 SDValue CurrentOp = SrcOp->getOperand(i);
18908 if (CurrentOp->isUndef()) {
18909 Elts.push_back(CurrentOp);
18912 ND = cast<ConstantSDNode>(CurrentOp);
18913 const APInt &C = ND->getAPIntValue();
18914 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18917 case X86ISD::VSRLI:
// Constant-fold each lane with a logical right shift.
18918 for (unsigned i=0; i!=NumElts; ++i) {
18919 SDValue CurrentOp = SrcOp->getOperand(i);
18920 if (CurrentOp->isUndef()) {
18921 Elts.push_back(CurrentOp);
18924 ND = cast<ConstantSDNode>(CurrentOp);
18925 const APInt &C = ND->getAPIntValue();
18926 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18929 case X86ISD::VSRAI:
// Constant-fold each lane with an arithmetic (sign-preserving) right shift.
18930 for (unsigned i=0; i!=NumElts; ++i) {
18931 SDValue CurrentOp = SrcOp->getOperand(i);
18932 if (CurrentOp->isUndef()) {
18933 Elts.push_back(CurrentOp);
18936 ND = cast<ConstantSDNode>(CurrentOp);
18937 const APInt &C = ND->getAPIntValue();
18938 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18943 return DAG.getBuildVector(VT, dl, Elts);
// Otherwise emit the target shift node with an i8 immediate amount.
18946 return DAG.getNode(Opc, dl, VT, SrcOp,
18947 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18950 /// Handle vector element shifts where the shift amount may or may not be a
18951 /// constant. Takes immediate version of shift as input.
// NOTE(review): this excerpt omits a few interleaved source lines (e.g. the
// 'switch (Opc)' line and an else branch); comments below describe only the
// visible logic.
18952 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18953 SDValue SrcOp, SDValue ShAmt,
18954 const X86Subtarget &Subtarget,
18955 SelectionDAG &DAG) {
18956 MVT SVT = ShAmt.getSimpleValueType();
18957 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18959 // Catch shift-by-constant.
18960 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18961 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18962 CShAmt->getZExtValue(), DAG);
18964 // Change opcode to non-immediate version
18966 default: llvm_unreachable("Unknown target vector shift node");
18967 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18968 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18969 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18972 // Need to build a vector containing shift amount.
18973 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18974 // +=================+============+=======================================+
18975 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18976 // +=================+============+=======================================+
18977 // | i64 | Yes, No | Use ShAmt as lowest elt |
18978 // | i32 | Yes | zero-extend in-reg |
18979 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18980 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
18981 // +=================+============+=======================================+
18983 if (SVT == MVT::i64)
18984 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18985 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18986 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Peek through the zext and zero-extend the i16 in-register instead.
18987 ShAmt = ShAmt.getOperand(0);
18988 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
18989 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18990 } else if (Subtarget.hasSSE41() &&
18991 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18992 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18993 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
// Pre-SSE4.1 fallback: explicit v4i32 build_vector(ShAmt, 0, undef, undef).
18995 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18996 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18997 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19000 // The return type has to be a 128-bit type with the same element
19001 // type as the input type.
19002 MVT EltVT = VT.getVectorElementType();
19003 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19005 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19006 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19009 /// \brief Return Mask with the necessary casting or extending
19010 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19011 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19012 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19015 if (isAllOnesConstant(Mask))
19016 return DAG.getTargetConstant(1, dl, MaskVT);
19017 if (X86::isZeroNode(Mask))
19018 return DAG.getTargetConstant(0, dl, MaskVT);
19020 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19021 // Mask should be extended
19022 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19023 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19026 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19027 if (MaskVT == MVT::v64i1) {
19028 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19029 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
19031 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19032 DAG.getConstant(0, dl, MVT::i32));
19033 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19034 DAG.getConstant(1, dl, MVT::i32));
19036 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19037 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19039 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19041 // MaskVT require < 64bit. Truncate mask (should succeed in any case),
19043 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19044 return DAG.getBitcast(MaskVT,
19045 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19049 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19050 Mask.getSimpleValueType().getSizeInBits());
19051 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19052 // are extracted by EXTRACT_SUBVECTOR.
19053 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19054 DAG.getBitcast(BitcastVT, Mask),
19055 DAG.getIntPtrConstant(0, dl));
19059 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19060 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19061 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19062 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19063 SDValue PreservedSrc,
19064 const X86Subtarget &Subtarget,
19065 SelectionDAG &DAG) {
19066 MVT VT = Op.getSimpleValueType();
19067 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19068 unsigned OpcodeSelect = ISD::VSELECT;
19071 if (isAllOnesConstant(Mask))
19074 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19076 switch (Op.getOpcode()) {
19078 case X86ISD::PCMPEQM:
19079 case X86ISD::PCMPGTM:
19081 case X86ISD::CMPMU:
19082 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19083 case X86ISD::VFPCLASS:
19084 case X86ISD::VFPCLASSS:
19085 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19086 case X86ISD::VTRUNC:
19087 case X86ISD::VTRUNCS:
19088 case X86ISD::VTRUNCUS:
19089 case X86ISD::CVTPS2PH:
19090 // We can't use ISD::VSELECT here because it is not always "Legal"
19091 // for the destination type. For example vpmovqb require only AVX512
19092 // and vselect that can operate on byte element type require BWI
19093 OpcodeSelect = X86ISD::SELECT;
19096 if (PreservedSrc.isUndef())
19097 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19098 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19101 /// \brief Creates an SDNode for a predicated scalar operation.
19102 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19103 /// The mask is coming as MVT::i8 and it should be truncated
19104 /// to MVT::i1 while lowering masking intrinsics.
19105 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19106 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19107 /// for a scalar instruction.
19108 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19109 SDValue PreservedSrc,
19110 const X86Subtarget &Subtarget,
19111 SelectionDAG &DAG) {
19112 if (isAllOnesConstant(Mask))
19115 MVT VT = Op.getSimpleValueType();
19117 // The mask should be of type MVT::i1
19118 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
19120 if (Op.getOpcode() == X86ISD::FSETCCM ||
19121 Op.getOpcode() == X86ISD::FSETCCM_RND)
19122 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19123 if (Op.getOpcode() == X86ISD::VFPCLASS ||
19124 Op.getOpcode() == X86ISD::VFPCLASSS)
19125 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19127 if (PreservedSrc.isUndef())
19128 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19129 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19132 static int getSEHRegistrationNodeSize(const Function *Fn) {
19133 if (!Fn->hasPersonalityFn())
19134 report_fatal_error(
19135 "querying registration node size for function without personality");
19136 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19137 // WinEHStatePass for the full struct definition.
19138 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19139 case EHPersonality::MSVC_X86SEH: return 24;
19140 case EHPersonality::MSVC_CXX: return 16;
19143 report_fatal_error(
19144 "can only recover FP for 32-bit MSVC EH personality functions");
19147 /// When the MSVC runtime transfers control to us, either to an outlined
19148 /// function or when returning to a parent frame after catching an exception, we
19149 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19150 /// Here's the math:
19151 /// RegNodeBase = EntryEBP - RegNodeSize
19152 /// ParentFP = RegNodeBase - ParentFrameOffset
19153 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19154 /// subtracting the offset (negative on x86) takes us back to the parent FP.
19155 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19156 SDValue EntryEBP) {
19157 MachineFunction &MF = DAG.getMachineFunction();
19160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19161 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19163 // It's possible that the parent function no longer has a personality function
19164 // if the exceptional code was optimized away, in which case we just return
19165 // the incoming EBP.
19166 if (!Fn->hasPersonalityFn())
19169 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19170 // registration, or the .set_setframe offset.
19171 MCSymbol *OffsetSym =
19172 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19173 GlobalValue::getRealLinkageName(Fn->getName()));
19174 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19175 SDValue ParentFrameOffset =
19176 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19178 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19179 // prologue to RBP in the parent function.
19180 const X86Subtarget &Subtarget =
19181 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19182 if (Subtarget.is64Bit())
19183 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19185 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19186 // RegNodeBase = EntryEBP - RegNodeSize
19187 // ParentFP = RegNodeBase - ParentFrameOffset
19188 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19189 DAG.getConstant(RegNodeSize, dl, PtrVT));
19190 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19193 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19194 SelectionDAG &DAG) {
19195 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19196 auto isRoundModeCurDirection = [](SDValue Rnd) {
19197 if (!isa<ConstantSDNode>(Rnd))
19200 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19201 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19205 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19206 MVT VT = Op.getSimpleValueType();
19207 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19209 switch(IntrData->Type) {
19210 case INTR_TYPE_1OP:
19211 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19212 case INTR_TYPE_2OP:
19213 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19215 case INTR_TYPE_3OP:
19216 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19217 Op.getOperand(2), Op.getOperand(3));
19218 case INTR_TYPE_4OP:
19219 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19220 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19221 case INTR_TYPE_1OP_MASK_RM: {
19222 SDValue Src = Op.getOperand(1);
19223 SDValue PassThru = Op.getOperand(2);
19224 SDValue Mask = Op.getOperand(3);
19225 SDValue RoundingMode;
19226 // We always add rounding mode to the Node.
19227 // If the rounding mode is not specified, we add the
19228 // "current direction" mode.
19229 if (Op.getNumOperands() == 4)
19231 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19233 RoundingMode = Op.getOperand(4);
19234 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19235 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19237 Mask, PassThru, Subtarget, DAG);
19239 case INTR_TYPE_1OP_MASK: {
19240 SDValue Src = Op.getOperand(1);
19241 SDValue PassThru = Op.getOperand(2);
19242 SDValue Mask = Op.getOperand(3);
19243 // We add rounding mode to the Node when
19244 // - RM Opcode is specified and
19245 // - RM is not "current direction".
19246 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19247 if (IntrWithRoundingModeOpcode != 0) {
19248 SDValue Rnd = Op.getOperand(4);
19249 if (!isRoundModeCurDirection(Rnd)) {
19250 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19251 dl, Op.getValueType(),
19253 Mask, PassThru, Subtarget, DAG);
19256 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19257 Mask, PassThru, Subtarget, DAG);
19259 case INTR_TYPE_SCALAR_MASK: {
19260 SDValue Src1 = Op.getOperand(1);
19261 SDValue Src2 = Op.getOperand(2);
19262 SDValue passThru = Op.getOperand(3);
19263 SDValue Mask = Op.getOperand(4);
19264 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19265 if (IntrWithRoundingModeOpcode != 0) {
19266 SDValue Rnd = Op.getOperand(5);
19267 if (!isRoundModeCurDirection(Rnd))
19268 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19269 dl, VT, Src1, Src2, Rnd),
19270 Mask, passThru, Subtarget, DAG);
19272 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19273 Mask, passThru, Subtarget, DAG);
19275 case INTR_TYPE_SCALAR_MASK_RM: {
19276 SDValue Src1 = Op.getOperand(1);
19277 SDValue Src2 = Op.getOperand(2);
19278 SDValue Src0 = Op.getOperand(3);
19279 SDValue Mask = Op.getOperand(4);
19280 // There are 2 kinds of intrinsics in this group:
19281 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19282 // (2) With rounding mode and sae - 7 operands.
19283 if (Op.getNumOperands() == 6) {
19284 SDValue Sae = Op.getOperand(5);
19285 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19287 Mask, Src0, Subtarget, DAG);
19289 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19290 SDValue RoundingMode = Op.getOperand(5);
19291 SDValue Sae = Op.getOperand(6);
19292 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19293 RoundingMode, Sae),
19294 Mask, Src0, Subtarget, DAG);
19296 case INTR_TYPE_2OP_MASK:
19297 case INTR_TYPE_2OP_IMM8_MASK: {
19298 SDValue Src1 = Op.getOperand(1);
19299 SDValue Src2 = Op.getOperand(2);
19300 SDValue PassThru = Op.getOperand(3);
19301 SDValue Mask = Op.getOperand(4);
19303 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19304 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19306 // We specify 2 possible opcodes for intrinsics with rounding modes.
19307 // First, we check if the intrinsic may have non-default rounding mode,
19308 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19309 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19310 if (IntrWithRoundingModeOpcode != 0) {
19311 SDValue Rnd = Op.getOperand(5);
19312 if (!isRoundModeCurDirection(Rnd)) {
19313 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19314 dl, Op.getValueType(),
19316 Mask, PassThru, Subtarget, DAG);
19319 // TODO: Intrinsics should have fast-math-flags to propagate.
19320 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19321 Mask, PassThru, Subtarget, DAG);
19323 case INTR_TYPE_2OP_MASK_RM: {
19324 SDValue Src1 = Op.getOperand(1);
19325 SDValue Src2 = Op.getOperand(2);
19326 SDValue PassThru = Op.getOperand(3);
19327 SDValue Mask = Op.getOperand(4);
19328 // We specify 2 possible modes for intrinsics, with/without rounding
19330 // First, we check if the intrinsic have rounding mode (6 operands),
19331 // if not, we set rounding mode to "current".
19333 if (Op.getNumOperands() == 6)
19334 Rnd = Op.getOperand(5);
19336 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19337 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19339 Mask, PassThru, Subtarget, DAG);
19341 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19342 SDValue Src1 = Op.getOperand(1);
19343 SDValue Src2 = Op.getOperand(2);
19344 SDValue Src3 = Op.getOperand(3);
19345 SDValue PassThru = Op.getOperand(4);
19346 SDValue Mask = Op.getOperand(5);
19347 SDValue Sae = Op.getOperand(6);
19349 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19351 Mask, PassThru, Subtarget, DAG);
19353 case INTR_TYPE_3OP_MASK_RM: {
19354 SDValue Src1 = Op.getOperand(1);
19355 SDValue Src2 = Op.getOperand(2);
19356 SDValue Imm = Op.getOperand(3);
19357 SDValue PassThru = Op.getOperand(4);
19358 SDValue Mask = Op.getOperand(5);
19359 // We specify 2 possible modes for intrinsics, with/without rounding
19361 // First, we check if the intrinsic have rounding mode (7 operands),
19362 // if not, we set rounding mode to "current".
19364 if (Op.getNumOperands() == 7)
19365 Rnd = Op.getOperand(6);
19367 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19368 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19369 Src1, Src2, Imm, Rnd),
19370 Mask, PassThru, Subtarget, DAG);
19372 case INTR_TYPE_3OP_IMM8_MASK:
19373 case INTR_TYPE_3OP_MASK: {
19374 SDValue Src1 = Op.getOperand(1);
19375 SDValue Src2 = Op.getOperand(2);
19376 SDValue Src3 = Op.getOperand(3);
19377 SDValue PassThru = Op.getOperand(4);
19378 SDValue Mask = Op.getOperand(5);
19380 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19381 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19383 // We specify 2 possible opcodes for intrinsics with rounding modes.
19384 // First, we check if the intrinsic may have non-default rounding mode,
19385 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19386 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19387 if (IntrWithRoundingModeOpcode != 0) {
19388 SDValue Rnd = Op.getOperand(6);
19389 if (!isRoundModeCurDirection(Rnd)) {
19390 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19391 dl, Op.getValueType(),
19392 Src1, Src2, Src3, Rnd),
19393 Mask, PassThru, Subtarget, DAG);
19396 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19398 Mask, PassThru, Subtarget, DAG);
19400 case VPERM_2OP_MASK : {
19401 SDValue Src1 = Op.getOperand(1);
19402 SDValue Src2 = Op.getOperand(2);
19403 SDValue PassThru = Op.getOperand(3);
19404 SDValue Mask = Op.getOperand(4);
19406 // Swap Src1 and Src2 in the node creation
19407 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19408 Mask, PassThru, Subtarget, DAG);
19410 case VPERM_3OP_MASKZ:
19411 case VPERM_3OP_MASK:{
19412 MVT VT = Op.getSimpleValueType();
19413 // Src2 is the PassThru
19414 SDValue Src1 = Op.getOperand(1);
19415 // PassThru needs to be the same type as the destination in order
19416 // to pattern match correctly.
19417 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19418 SDValue Src3 = Op.getOperand(3);
19419 SDValue Mask = Op.getOperand(4);
19420 SDValue PassThru = SDValue();
19422 // set PassThru element
19423 if (IntrData->Type == VPERM_3OP_MASKZ)
19424 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19428 // Swap Src1 and Src2 in the node creation
19429 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19430 dl, Op.getValueType(),
19432 Mask, PassThru, Subtarget, DAG);
19436 case FMA_OP_MASK: {
19437 SDValue Src1 = Op.getOperand(1);
19438 SDValue Src2 = Op.getOperand(2);
19439 SDValue Src3 = Op.getOperand(3);
19440 SDValue Mask = Op.getOperand(4);
19441 MVT VT = Op.getSimpleValueType();
19442 SDValue PassThru = SDValue();
19444 // set PassThru element
19445 if (IntrData->Type == FMA_OP_MASKZ)
19446 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19447 else if (IntrData->Type == FMA_OP_MASK3)
19452 // We specify 2 possible opcodes for intrinsics with rounding modes.
19453 // First, we check if the intrinsic may have non-default rounding mode,
19454 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19455 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19456 if (IntrWithRoundingModeOpcode != 0) {
19457 SDValue Rnd = Op.getOperand(5);
19458 if (!isRoundModeCurDirection(Rnd))
19459 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19460 dl, Op.getValueType(),
19461 Src1, Src2, Src3, Rnd),
19462 Mask, PassThru, Subtarget, DAG);
19464 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19465 dl, Op.getValueType(),
19467 Mask, PassThru, Subtarget, DAG);
19469 case FMA_OP_SCALAR_MASK:
19470 case FMA_OP_SCALAR_MASK3:
19471 case FMA_OP_SCALAR_MASKZ: {
19472 SDValue Src1 = Op.getOperand(1);
19473 SDValue Src2 = Op.getOperand(2);
19474 SDValue Src3 = Op.getOperand(3);
19475 SDValue Mask = Op.getOperand(4);
19476 MVT VT = Op.getSimpleValueType();
19477 SDValue PassThru = SDValue();
19479 // set PassThru element
19480 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19481 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19482 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19487 SDValue Rnd = Op.getOperand(5);
19488 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19489 Op.getValueType(), Src1, Src2,
19491 Mask, PassThru, Subtarget, DAG);
19493 case TERLOG_OP_MASK:
19494 case TERLOG_OP_MASKZ: {
19495 SDValue Src1 = Op.getOperand(1);
19496 SDValue Src2 = Op.getOperand(2);
19497 SDValue Src3 = Op.getOperand(3);
19498 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19499 SDValue Mask = Op.getOperand(5);
19500 MVT VT = Op.getSimpleValueType();
19501 SDValue PassThru = Src1;
19502 // Set PassThru element.
19503 if (IntrData->Type == TERLOG_OP_MASKZ)
19504 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19506 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19507 Src1, Src2, Src3, Src4),
19508 Mask, PassThru, Subtarget, DAG);
19511 // ISD::FP_ROUND has a second argument that indicates if the truncation
19512 // does not change the value. Set it to 0 since it can change.
19513 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19514 DAG.getIntPtrConstant(0, dl));
19515 case CVTPD2PS_MASK: {
19516 SDValue Src = Op.getOperand(1);
19517 SDValue PassThru = Op.getOperand(2);
19518 SDValue Mask = Op.getOperand(3);
19519 // We add rounding mode to the Node when
19520 // - RM Opcode is specified and
19521 // - RM is not "current direction".
19522 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19523 if (IntrWithRoundingModeOpcode != 0) {
19524 SDValue Rnd = Op.getOperand(4);
19525 if (!isRoundModeCurDirection(Rnd)) {
19526 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19527 dl, Op.getValueType(),
19529 Mask, PassThru, Subtarget, DAG);
19532 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19533 // ISD::FP_ROUND has a second argument that indicates if the truncation
19534 // does not change the value. Set it to 0 since it can change.
19535 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19536 DAG.getIntPtrConstant(0, dl)),
19537 Mask, PassThru, Subtarget, DAG);
19540 // FPclass intrinsics with mask
19541 SDValue Src1 = Op.getOperand(1);
19542 MVT VT = Src1.getSimpleValueType();
19543 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19544 SDValue Imm = Op.getOperand(2);
19545 SDValue Mask = Op.getOperand(3);
19546 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19547 Mask.getSimpleValueType().getSizeInBits());
19548 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19549 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19550 DAG.getTargetConstant(0, dl, MaskVT),
19552 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19553 DAG.getUNDEF(BitcastVT), FPclassMask,
19554 DAG.getIntPtrConstant(0, dl));
19555 return DAG.getBitcast(Op.getValueType(), Res);
19558 SDValue Src1 = Op.getOperand(1);
19559 SDValue Imm = Op.getOperand(2);
19560 SDValue Mask = Op.getOperand(3);
19561 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
19562 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19563 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19564 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
19567 case CMP_MASK_CC: {
19568 // Comparison intrinsics with masks.
19569 // Example of transformation:
19570 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19571 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19573 // (v8i1 (insert_subvector undef,
19574 // (v2i1 (and (PCMPEQM %a, %b),
19575 // (extract_subvector
19576 // (v8i1 (bitcast %mask)), 0))), 0))))
19577 MVT VT = Op.getOperand(1).getSimpleValueType();
19578 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19579 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19580 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19581 Mask.getSimpleValueType().getSizeInBits());
19583 if (IntrData->Type == CMP_MASK_CC) {
19584 SDValue CC = Op.getOperand(3);
19585 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19586 // We specify 2 possible opcodes for intrinsics with rounding modes.
19587 // First, we check if the intrinsic may have non-default rounding mode,
19588 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19589 if (IntrData->Opc1 != 0) {
19590 SDValue Rnd = Op.getOperand(5);
19591 if (!isRoundModeCurDirection(Rnd))
19592 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19593 Op.getOperand(2), CC, Rnd);
19595 //default rounding mode
19597 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19598 Op.getOperand(2), CC);
19601 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19602 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19605 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19606 DAG.getTargetConstant(0, dl,
19609 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19610 DAG.getUNDEF(BitcastVT), CmpMask,
19611 DAG.getIntPtrConstant(0, dl));
19612 return DAG.getBitcast(Op.getValueType(), Res);
19614 case CMP_MASK_SCALAR_CC: {
19615 SDValue Src1 = Op.getOperand(1);
19616 SDValue Src2 = Op.getOperand(2);
19617 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19618 SDValue Mask = Op.getOperand(4);
19621 if (IntrData->Opc1 != 0) {
19622 SDValue Rnd = Op.getOperand(5);
19623 if (!isRoundModeCurDirection(Rnd))
19624 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
19626 //default rounding mode
19628 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
19630 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19631 DAG.getTargetConstant(0, dl,
19635 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
19637 case COMI: { // Comparison intrinsics
19638 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19639 SDValue LHS = Op.getOperand(1);
19640 SDValue RHS = Op.getOperand(2);
19641 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19642 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19645 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19646 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19647 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19648 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19651 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19652 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19653 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19654 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19657 case ISD::SETGT: // (CF = 0 and ZF = 0)
19658 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19660 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19661 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19664 case ISD::SETGE: // CF = 0
19665 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19667 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19668 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19671 llvm_unreachable("Unexpected illegal condition!");
19673 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19675 case COMI_RM: { // Comparison intrinsics with Sae
19676 SDValue LHS = Op.getOperand(1);
19677 SDValue RHS = Op.getOperand(2);
19678 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19679 SDValue Sae = Op.getOperand(4);
19682 if (isRoundModeCurDirection(Sae))
19683 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19684 DAG.getConstant(CondVal, dl, MVT::i8));
19686 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19687 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19688 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19689 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19692 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19693 Op.getOperand(1), Op.getOperand(2), Subtarget,
19695 case COMPRESS_EXPAND_IN_REG: {
19696 SDValue Mask = Op.getOperand(3);
19697 SDValue DataToCompress = Op.getOperand(1);
19698 SDValue PassThru = Op.getOperand(2);
19699 if (isAllOnesConstant(Mask)) // return data as is
19700 return Op.getOperand(1);
19702 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19704 Mask, PassThru, Subtarget, DAG);
19707 SDValue Mask = Op.getOperand(1);
19708 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19709 Mask.getSimpleValueType().getSizeInBits());
19710 Mask = DAG.getBitcast(MaskVT, Mask);
19711 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19714 MVT VT = Op.getSimpleValueType();
19715 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19717 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19718 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19719 // Arguments should be swapped.
19720 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19721 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19723 return DAG.getBitcast(VT, Res);
19726 MVT VT = Op.getSimpleValueType();
19727 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19729 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19730 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19731 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19732 return DAG.getBitcast(VT, Res);
19735 case FIXUPIMMS_MASKZ:
19737 case FIXUPIMM_MASKZ:{
19738 SDValue Src1 = Op.getOperand(1);
19739 SDValue Src2 = Op.getOperand(2);
19740 SDValue Src3 = Op.getOperand(3);
19741 SDValue Imm = Op.getOperand(4);
19742 SDValue Mask = Op.getOperand(5);
19743 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19744 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19745 // We specify 2 possible modes for intrinsics, with/without rounding
19747 // First, we check if the intrinsic have rounding mode (7 operands),
19748 // if not, we set rounding mode to "current".
19750 if (Op.getNumOperands() == 7)
19751 Rnd = Op.getOperand(6);
19753 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19754 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19755 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19756 Src1, Src2, Src3, Imm, Rnd),
19757 Mask, Passthru, Subtarget, DAG);
19758 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19759 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19760 Src1, Src2, Src3, Imm, Rnd),
19761 Mask, Passthru, Subtarget, DAG);
19763 case CONVERT_TO_MASK: {
19764 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19765 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19766 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19768 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19770 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19771 DAG.getUNDEF(BitcastVT), CvtMask,
19772 DAG.getIntPtrConstant(0, dl));
19773 return DAG.getBitcast(Op.getValueType(), Res);
19775 case CONVERT_MASK_TO_VEC: {
19776 SDValue Mask = Op.getOperand(1);
19777 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19778 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19779 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19781 case BRCST_SUBVEC_TO_VEC: {
19782 SDValue Src = Op.getOperand(1);
19783 SDValue Passthru = Op.getOperand(2);
19784 SDValue Mask = Op.getOperand(3);
19785 EVT resVT = Passthru.getValueType();
19786 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19787 DAG.getUNDEF(resVT), Src,
19788 DAG.getIntPtrConstant(0, dl));
19790 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19791 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19793 immVal = DAG.getConstant(0, dl, MVT::i8);
19794 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19795 subVec, subVec, immVal),
19796 Mask, Passthru, Subtarget, DAG);
19798 case BRCST32x2_TO_VEC: {
19799 SDValue Src = Op.getOperand(1);
19800 SDValue PassThru = Op.getOperand(2);
19801 SDValue Mask = Op.getOperand(3);
19803 assert((VT.getScalarType() == MVT::i32 ||
19804 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19805 //bitcast Src to packed 64
19806 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19807 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19808 Src = DAG.getBitcast(BitcastVT, Src);
19810 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19811 Mask, PassThru, Subtarget, DAG);
19819 default: return SDValue(); // Don't custom lower most intrinsics.
19821 case Intrinsic::x86_avx2_permd:
19822 case Intrinsic::x86_avx2_permps:
19823 // Operands intentionally swapped. Mask is last operand to intrinsic,
19824 // but second operand for node/instruction.
19825 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19826 Op.getOperand(2), Op.getOperand(1));
19828 // ptest and testp intrinsics. The intrinsic these come from are designed to
19829 // return an integer value, not just an instruction so lower it to the ptest
19830 // or testp pattern and a setcc for the result.
19831 case Intrinsic::x86_sse41_ptestz:
19832 case Intrinsic::x86_sse41_ptestc:
19833 case Intrinsic::x86_sse41_ptestnzc:
19834 case Intrinsic::x86_avx_ptestz_256:
19835 case Intrinsic::x86_avx_ptestc_256:
19836 case Intrinsic::x86_avx_ptestnzc_256:
19837 case Intrinsic::x86_avx_vtestz_ps:
19838 case Intrinsic::x86_avx_vtestc_ps:
19839 case Intrinsic::x86_avx_vtestnzc_ps:
19840 case Intrinsic::x86_avx_vtestz_pd:
19841 case Intrinsic::x86_avx_vtestc_pd:
19842 case Intrinsic::x86_avx_vtestnzc_pd:
19843 case Intrinsic::x86_avx_vtestz_ps_256:
19844 case Intrinsic::x86_avx_vtestc_ps_256:
19845 case Intrinsic::x86_avx_vtestnzc_ps_256:
19846 case Intrinsic::x86_avx_vtestz_pd_256:
19847 case Intrinsic::x86_avx_vtestc_pd_256:
19848 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19849 bool IsTestPacked = false;
19850 X86::CondCode X86CC;
19852 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19853 case Intrinsic::x86_avx_vtestz_ps:
19854 case Intrinsic::x86_avx_vtestz_pd:
19855 case Intrinsic::x86_avx_vtestz_ps_256:
19856 case Intrinsic::x86_avx_vtestz_pd_256:
19857 IsTestPacked = true;
19859 case Intrinsic::x86_sse41_ptestz:
19860 case Intrinsic::x86_avx_ptestz_256:
19862 X86CC = X86::COND_E;
19864 case Intrinsic::x86_avx_vtestc_ps:
19865 case Intrinsic::x86_avx_vtestc_pd:
19866 case Intrinsic::x86_avx_vtestc_ps_256:
19867 case Intrinsic::x86_avx_vtestc_pd_256:
19868 IsTestPacked = true;
19870 case Intrinsic::x86_sse41_ptestc:
19871 case Intrinsic::x86_avx_ptestc_256:
19873 X86CC = X86::COND_B;
19875 case Intrinsic::x86_avx_vtestnzc_ps:
19876 case Intrinsic::x86_avx_vtestnzc_pd:
19877 case Intrinsic::x86_avx_vtestnzc_ps_256:
19878 case Intrinsic::x86_avx_vtestnzc_pd_256:
19879 IsTestPacked = true;
19881 case Intrinsic::x86_sse41_ptestnzc:
19882 case Intrinsic::x86_avx_ptestnzc_256:
19884 X86CC = X86::COND_A;
19888 SDValue LHS = Op.getOperand(1);
19889 SDValue RHS = Op.getOperand(2);
19890 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19891 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19892 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19893 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19895 case Intrinsic::x86_avx512_kortestz_w:
19896 case Intrinsic::x86_avx512_kortestc_w: {
19897 X86::CondCode X86CC =
19898 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19899 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19900 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19901 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19902 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19903 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19906 case Intrinsic::x86_avx512_knot_w: {
19907 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19908 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
19909 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19910 return DAG.getBitcast(MVT::i16, Res);
19913 case Intrinsic::x86_avx512_kandn_w: {
19914 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19915 // Invert LHS for the not.
19916 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
19917 DAG.getConstant(1, dl, MVT::v16i1));
19918 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19919 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
19920 return DAG.getBitcast(MVT::i16, Res);
19923 case Intrinsic::x86_avx512_kxnor_w: {
19924 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19925 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19926 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19927 // Invert result for the not.
19928 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
19929 DAG.getConstant(1, dl, MVT::v16i1));
19930 return DAG.getBitcast(MVT::i16, Res);
19933 case Intrinsic::x86_sse42_pcmpistria128:
19934 case Intrinsic::x86_sse42_pcmpestria128:
19935 case Intrinsic::x86_sse42_pcmpistric128:
19936 case Intrinsic::x86_sse42_pcmpestric128:
19937 case Intrinsic::x86_sse42_pcmpistrio128:
19938 case Intrinsic::x86_sse42_pcmpestrio128:
19939 case Intrinsic::x86_sse42_pcmpistris128:
19940 case Intrinsic::x86_sse42_pcmpestris128:
19941 case Intrinsic::x86_sse42_pcmpistriz128:
19942 case Intrinsic::x86_sse42_pcmpestriz128: {
19944 X86::CondCode X86CC;
19946 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19947 case Intrinsic::x86_sse42_pcmpistria128:
19948 Opcode = X86ISD::PCMPISTRI;
19949 X86CC = X86::COND_A;
19951 case Intrinsic::x86_sse42_pcmpestria128:
19952 Opcode = X86ISD::PCMPESTRI;
19953 X86CC = X86::COND_A;
19955 case Intrinsic::x86_sse42_pcmpistric128:
19956 Opcode = X86ISD::PCMPISTRI;
19957 X86CC = X86::COND_B;
19959 case Intrinsic::x86_sse42_pcmpestric128:
19960 Opcode = X86ISD::PCMPESTRI;
19961 X86CC = X86::COND_B;
19963 case Intrinsic::x86_sse42_pcmpistrio128:
19964 Opcode = X86ISD::PCMPISTRI;
19965 X86CC = X86::COND_O;
19967 case Intrinsic::x86_sse42_pcmpestrio128:
19968 Opcode = X86ISD::PCMPESTRI;
19969 X86CC = X86::COND_O;
19971 case Intrinsic::x86_sse42_pcmpistris128:
19972 Opcode = X86ISD::PCMPISTRI;
19973 X86CC = X86::COND_S;
19975 case Intrinsic::x86_sse42_pcmpestris128:
19976 Opcode = X86ISD::PCMPESTRI;
19977 X86CC = X86::COND_S;
19979 case Intrinsic::x86_sse42_pcmpistriz128:
19980 Opcode = X86ISD::PCMPISTRI;
19981 X86CC = X86::COND_E;
19983 case Intrinsic::x86_sse42_pcmpestriz128:
19984 Opcode = X86ISD::PCMPESTRI;
19985 X86CC = X86::COND_E;
19988 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19989 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19990 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19991 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19992 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19995 case Intrinsic::x86_sse42_pcmpistri128:
19996 case Intrinsic::x86_sse42_pcmpestri128: {
19998 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19999 Opcode = X86ISD::PCMPISTRI;
20001 Opcode = X86ISD::PCMPESTRI;
20003 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20004 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20005 return DAG.getNode(Opcode, dl, VTs, NewOps);
20008 case Intrinsic::eh_sjlj_lsda: {
20009 MachineFunction &MF = DAG.getMachineFunction();
20010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20011 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20012 auto &Context = MF.getMMI().getContext();
20013 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20014 Twine(MF.getFunctionNumber()));
20015 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20018 case Intrinsic::x86_seh_lsda: {
20019 // Compute the symbol for the LSDA. We know it'll get emitted later.
20020 MachineFunction &MF = DAG.getMachineFunction();
20021 SDValue Op1 = Op.getOperand(1);
20022 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20023 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20024 GlobalValue::getRealLinkageName(Fn->getName()));
20026 // Generate a simple absolute symbol reference. This intrinsic is only
20027 // supported on 32-bit Windows, which isn't PIC.
20028 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20029 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20032 case Intrinsic::x86_seh_recoverfp: {
20033 SDValue FnOp = Op.getOperand(1);
20034 SDValue IncomingFPOp = Op.getOperand(2);
20035 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20036 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20038 report_fatal_error(
20039 "llvm.x86.seh.recoverfp must take a function as the first argument");
20040 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20043 case Intrinsic::localaddress: {
20044 // Returns one of the stack, base, or frame pointer registers, depending on
20045 // which is used to reference local variables.
20046 MachineFunction &MF = DAG.getMachineFunction();
20047 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20049 if (RegInfo->hasBasePointer(MF))
20050 Reg = RegInfo->getBaseRegister();
20051 else // This function handles the SP or FP case.
20052 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20053 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
// Lower an AVX2 gather intrinsic to a target machine node. Builds the full
// x86 memory-operand tuple (Base, Scale, Index, Disp, Segment) the gather
// instruction expects. Unlike the AVX-512 variant below, the mask is used
// in its native vector type rather than being converted to a k-mask.
20058 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20059 SDValue Src, SDValue Mask, SDValue Base,
20060 SDValue Index, SDValue ScaleOp, SDValue Chain,
20061 const X86Subtarget &Subtarget) {
// Scale is required to be a compile-time constant; encode as imm8.
20063 auto *C = cast<ConstantSDNode>(ScaleOp);
20064 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20065 EVT MaskVT = Mask.getValueType();
20066 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// Gathers take no displacement and no segment override.
20067 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20068 SDValue Segment = DAG.getRegister(0, MVT::i32);
20069 // If source is undef or we know it won't be used, use a zero vector
20070 // to break register dependency.
20071 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20072 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20073 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20074 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20075 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Return the gathered value (result 0) plus the output chain (result 2);
// the machine node's mask result (result 1) is dropped here.
20076 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20077 return DAG.getMergeValues(RetOps, dl);
// Lower an AVX-512 gather intrinsic to a target machine node. Differs from
// getAVX2GatherNode above in that the integer mask operand is first
// converted to an i1 vector (k-register mask) via getMaskNode, sized by the
// number of index elements.
20080 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20081 SDValue Src, SDValue Mask, SDValue Base,
20082 SDValue Index, SDValue ScaleOp, SDValue Chain,
20083 const X86Subtarget &Subtarget) {
// Scale must be an immediate; encode as imm8.
20085 auto *C = cast<ConstantSDNode>(ScaleOp);
20086 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// One mask bit per gathered element (element count taken from Index).
20087 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20088 Index.getSimpleValueType().getVectorNumElements());
20090 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20091 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// No displacement and no segment override for gather addressing.
20092 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32)
20093 SDValue Segment = DAG.getRegister(0, MVT::i32);
20094 // If source is undef or we know it won't be used, use a zero vector
20095 // to break register dependency.
20096 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20097 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20098 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20099 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20100 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Return {gathered value, chain}; the updated-mask result is unused.
20101 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20102 return DAG.getMergeValues(RetOps, dl);
// Lower an AVX-512 scatter intrinsic to a target machine node. Mirror image
// of getGatherNode: the mask is converted to an i1 vector sized by the index
// element count, and only the chain result of the machine node is returned
// (a scatter produces no value).
20105 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20106 SDValue Src, SDValue Mask, SDValue Base,
20107 SDValue Index, SDValue ScaleOp, SDValue Chain,
20108 const X86Subtarget &Subtarget) {
// Scale must be an immediate; encode as imm8.
20110 auto *C = cast<ConstantSDNode>(ScaleOp);
20111 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// No displacement and no segment override.
20112 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20113 SDValue Segment = DAG.getRegister(0, MVT::i32);
// One mask bit per scattered element.
20114 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20115 Index.getSimpleValueType().getVectorNumElements());
20117 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20118 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20119 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20120 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Result 0 is the mask output; callers only need the chain (result 1).
20121 return SDValue(Res, 1);
// Lower a masked gather/scatter-prefetch intrinsic to a target machine node.
// Produces only a chain (prefetches have no value result). The mask is
// converted to an i1 vector sized by the index element count.
20124 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20125 SDValue Mask, SDValue Base, SDValue Index,
20126 SDValue ScaleOp, SDValue Chain,
20127 const X86Subtarget &Subtarget) {
// Scale must be an immediate; encode as imm8.
20129 auto *C = cast<ConstantSDNode>(ScaleOp);
20130 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// No displacement and no segment override.
20131 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20132 SDValue Segment = DAG.getRegister(0, MVT::i32);
// One mask bit per prefetched element (element count taken from Index).
20134 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20135 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20136 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20137 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20138 return SDValue(Res, 0);
20141 /// Handles the lowering of builtin intrinsic that return the value
20142 /// of the extended control register.
20143 /// Emits XGETBV: ECX selects the XCR index; the 64-bit value is returned
20144 /// in EDX:EAX (RDX:RAX on 64-bit). Pushes {value, chain} onto Results.
20143 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20145 const X86Subtarget &Subtarget,
20146 SmallVectorImpl<SDValue> &Results) {
20147 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20148 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20151 // The ECX register is used to select the index of the XCR register to
// return. Copy the intrinsic's index operand into ECX before XGETBV.
20154 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20155 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20156 Chain = SDValue(N1, 0);
20158 // Reads the content of XCR and returns it in registers EDX:EAX.
20159 if (Subtarget.is64Bit()) {
20160 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20161 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20164 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20165 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20168 Chain = HI.getValue(1);
20170 if (Subtarget.is64Bit()) {
20171 // Merge the two 32-bit values into a 64-bit one..
// 64-bit: the halves are already i64; combine as LO | (HI << 32).
20172 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20173 DAG.getConstant(32, DL, MVT::i8));
20174 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20175 Results.push_back(Chain);
20179 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
// 32-bit fallback path: BUILD_PAIR(LO, HI) forms the i64 result.
20180 SDValue Ops[] = { LO, HI };
20181 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20182 Results.push_back(Pair);
20183 Results.push_back(Chain);
20186 /// Handles the lowering of builtin intrinsics that read performance monitor
20187 /// counters (x86_rdpmc).
20188 /// ECX selects the counter; RDPMC returns the 64-bit counter in EDX:EAX.
20189 /// Pushes {value, chain} onto Results.
20188 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20190 const X86Subtarget &Subtarget,
20191 SmallVectorImpl<SDValue> &Results) {
20192 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20193 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20196 // The ECX register is used to select the index of the performance counter
// to read; copy the intrinsic's counter-index operand into ECX first.
20198 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20200 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20202 // Reads the content of a 64-bit performance counter and returns it in the
20203 // registers EDX:EAX.
20204 if (Subtarget.is64Bit()) {
20205 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20206 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20209 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20210 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20213 Chain = HI.getValue(1);
20215 if (Subtarget.is64Bit()) {
20216 // The EAX register is loaded with the low-order 32 bits. The EDX register
20217 // is loaded with the supported high-order bits of the counter.
// 64-bit: combine as LO | (HI << 32).
20218 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20219 DAG.getConstant(32, DL, MVT::i8));
20220 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20221 Results.push_back(Chain);
20225 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20226 SDValue Ops[] = { LO, HI };
20227 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20228 Results.push_back(Pair);
20229 Results.push_back(Chain);
20232 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20233 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20234 /// READCYCLECOUNTER nodes.
20235 /// Opcode selects RDTSC vs RDTSCP; for RDTSCP the TSC_AUX value (ECX) is
20236 /// additionally stored to the pointer in operand 2. Pushes {value, chain}
20237 /// onto Results.
20235 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20237 const X86Subtarget &Subtarget,
20238 SmallVectorImpl<SDValue> &Results) {
20239 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20240 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20243 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20244 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20245 // and the EAX register is loaded with the low-order 32 bits.
20246 if (Subtarget.is64Bit()) {
20247 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20248 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20251 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20252 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20255 SDValue Chain = HI.getValue(1);
20257 if (Opcode == X86ISD::RDTSCP_DAG) {
20258 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20260 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20261 // the ECX register. Add 'ecx' explicitly to the chain.
20262 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20264 // Explicitly store the content of ECX at the location passed in input
20265 // to the 'rdtscp' intrinsic.
20266 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20267 MachinePointerInfo());
20270 if (Subtarget.is64Bit()) {
20271 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20272 // the EAX register is loaded with the low-order 32 bits.
// 64-bit: combine the halves as LO | (HI << 32).
20273 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20274 DAG.getConstant(32, DL, MVT::i8));
20275 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20276 Results.push_back(Chain);
20280 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20281 SDValue Ops[] = { LO, HI };
20282 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20283 Results.push_back(Pair);
20284 Results.push_back(Chain);
// Custom-lower ISD::READCYCLECOUNTER by reusing the RDTSC lowering helper
// above and merging its {value, chain} results into one node.
20287 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20288 SelectionDAG &DAG) {
20289 SmallVector<SDValue, 2> Results;
20291 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20293 return DAG.getMergeValues(Results, DL);
// Record the frame index of the WinEH registration node (a static alloca
// passed to llvm.x86.seh.ehregnode) in WinEHFuncInfo. Emits no DAG nodes;
// simply returns the incoming chain.
20296 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20297 MachineFunction &MF = DAG.getMachineFunction();
20298 SDValue Chain = Op.getOperand(0);
20299 SDValue RegNode = Op.getOperand(2);
20300 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
// Hard error: this intrinsic is only meaningful under a WinEH personality.
20302 report_fatal_error("EH registrations only live in functions using WinEH");
20304 // Cast the operand to an alloca, and remember the frame index.
20305 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20307 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20308 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20310 // Return the chain operand without making any DAG nodes.
// Record the frame index of the EH guard slot (llvm.x86.seh.ehguard) in
// WinEHFuncInfo. Companion to MarkEHRegistrationNode; emits no DAG nodes
// and returns the incoming chain.
20314 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20315 MachineFunction &MF = DAG.getMachineFunction();
20316 SDValue Chain = Op.getOperand(0);
20317 SDValue EHGuard = Op.getOperand(2);
20318 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
// Hard error: only valid in functions using a WinEH personality.
20320 report_fatal_error("EHGuard only live in functions using WinEH");
20322 // Cast the operand to an alloca, and remember the frame index.
20323 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20325 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20326 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20328 // Return the chain operand without making any DAG nodes.
20332 /// Emit Truncating Store with signed or unsigned saturation.
20333 /// \p SignedSat selects TruncSStoreSDNode (signed) vs TruncUSStoreSDNode
20334 /// (unsigned). The Undef operand fills the unused offset slot of the
20335 /// target memory node's operand list.
20334 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20335 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20336 SelectionDAG &DAG) {
20338 SDVTList VTs = DAG.getVTList(MVT::Other);
20339 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20340 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20342 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20343 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20346 /// Emit Masked Truncating Store with signed or unsigned saturation.
20347 /// Masked counterpart of EmitTruncSStore: \p SignedSat selects
20348 /// MaskedTruncSStoreSDNode vs MaskedTruncUSStoreSDNode. Note the operand
20349 /// order differs from the unmasked form: { Chain, Ptr, Mask, Val }.
20348 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20349 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20350 MachineMemOperand *MMO, SelectionDAG &DAG) {
20352 SDVTList VTs = DAG.getVTList(MVT::Other);
20353 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20355 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20356 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
// Custom lowering for chained (side-effecting) x86 intrinsics. Intrinsics
// not found in the IntrinsicsWithChain table are handled by the special
// cases up front (SEH markers, EFLAGS read/write); table-driven intrinsics
// dispatch on IntrData->Type below.
20359 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20360 SelectionDAG &DAG) {
// Operand 1 holds the intrinsic ID; operand 0 is the chain.
20361 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20363 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
// SEH bookkeeping intrinsics: record state, produce no code.
20365 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
20366 return MarkEHRegistrationNode(Op, DAG);
20367 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
20368 return MarkEHGuard(Op, DAG);
20369 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
20370 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
20371 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
20372 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
20373 // We need a frame pointer because this will get lowered to a PUSH/POP
// sequence (PUSHF/POPF), which implies a stack adjustment.
20375 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20376 MFI.setHasCopyImplyingStackAdjustment(true);
20377 // Don't do anything here, we will expand these intrinsics out later
20378 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
// Table-driven lowering: dispatch on the intrinsic's recorded type.
20385 switch(IntrData->Type) {
20386 default: llvm_unreachable("Unknown Intrinsic Type");
// RDRAND/RDSEED-style: value + CF validity flag + chain.
20389 // Emit the node with the right value type.
20390 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20391 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20393 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20394 // Otherwise return the value from Rand, which is always 0, casted to i32.
20395 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20396 DAG.getConstant(1, dl, Op->getValueType(1)),
20397 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20398 SDValue(Result.getNode(), 1) };
20399 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20400 DAG.getVTList(Op->getValueType(1), MVT::Glue),
20403 // Return { result, isValid, chain }.
20404 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20405 SDValue(Result.getNode(), 2));
20407 case GATHER_AVX2: {
// gather(v1, base, index, mask, scale) -> AVX2 gather machine node.
20408 SDValue Chain = Op.getOperand(0);
20409 SDValue Src = Op.getOperand(2);
20410 SDValue Base = Op.getOperand(3);
20411 SDValue Index = Op.getOperand(4);
20412 SDValue Mask = Op.getOperand(5);
20413 SDValue Scale = Op.getOperand(6);
20414 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20415 Scale, Chain, Subtarget);
20418 //gather(v1, mask, index, base, scale);
20419 SDValue Chain = Op.getOperand(0);
20420 SDValue Src = Op.getOperand(2);
20421 SDValue Base = Op.getOperand(3);
20422 SDValue Index = Op.getOperand(4);
20423 SDValue Mask = Op.getOperand(5);
20424 SDValue Scale = Op.getOperand(6);
20425 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20429 //scatter(base, mask, index, v1, scale);
// Note the operand order differs from gather: Base/Mask/Index/Src.
20430 SDValue Chain = Op.getOperand(0);
20431 SDValue Base = Op.getOperand(2);
20432 SDValue Mask = Op.getOperand(3);
20433 SDValue Index = Op.getOperand(4);
20434 SDValue Src = Op.getOperand(5);
20435 SDValue Scale = Op.getOperand(6);
20436 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20437 Scale, Chain, Subtarget);
// Masked prefetch: hint 2/3 selects T0 vs T1 opcode variants.
20440 SDValue Hint = Op.getOperand(6);
20441 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20442 assert((HintVal == 2 || HintVal == 3) &&
20443 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20444 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20445 SDValue Chain = Op.getOperand(0);
20446 SDValue Mask = Op.getOperand(2);
20447 SDValue Index = Op.getOperand(3);
20448 SDValue Base = Op.getOperand(4);
20449 SDValue Scale = Op.getOperand(5);
20450 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20453 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20455 SmallVector<SDValue, 2> Results;
20456 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20458 return DAG.getMergeValues(Results, dl);
20460 // Read Performance Monitoring Counters.
20462 SmallVector<SDValue, 2> Results;
20463 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20464 return DAG.getMergeValues(Results, dl);
20466 // Get Extended Control Register.
20468 SmallVector<SDValue, 2> Results;
20469 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20470 return DAG.getMergeValues(Results, dl);
20472 // XTEST intrinsics.
// Result is ZF-based: setcc(NE) on the XTEST node, zero-extended.
20474 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20475 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20477 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20478 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20479 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20480 Ret, SDValue(InTrans.getNode(), 1));
// ADC/SBB with carry-in: "ADD x, -1" regenerates CF from the i8 carry flag
// operand, then the real op consumes that CF and its result is stored.
20484 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20485 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
20486 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20487 DAG.getConstant(-1, dl, MVT::i8));
20488 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20489 Op.getOperand(4), GenCF.getValue(1));
20490 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20491 Op.getOperand(5), MachinePointerInfo());
20492 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20493 SDValue Results[] = { SetCC, Store };
20494 return DAG.getMergeValues(Results, dl);
20496 case COMPRESS_TO_MEM: {
20497 SDValue Mask = Op.getOperand(4);
20498 SDValue DataToCompress = Op.getOperand(3);
20499 SDValue Addr = Op.getOperand(2);
20500 SDValue Chain = Op.getOperand(0);
20501 MVT VT = DataToCompress.getSimpleValueType();
20503 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20504 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20506 if (isAllOnesConstant(Mask)) // return just a store
20507 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20508 MemIntr->getMemOperand());
20510 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20511 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Compressing masked store: only active lanes are written, contiguously.
20513 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20514 MemIntr->getMemOperand(),
20515 false /* truncating */, true /* compressing */);
20517 case TRUNCATE_TO_MEM_VI8:
20518 case TRUNCATE_TO_MEM_VI16:
20519 case TRUNCATE_TO_MEM_VI32: {
20520 SDValue Mask = Op.getOperand(4);
20521 SDValue DataToTruncate = Op.getOperand(3);
20522 SDValue Addr = Op.getOperand(2);
20523 SDValue Chain = Op.getOperand(0);
20525 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20526 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20528 EVT MemVT = MemIntr->getMemoryVT();
20530 uint16_t TruncationOp = IntrData->Opc0;
20531 switch (TruncationOp) {
20532 case X86ISD::VTRUNC: {
// Plain (non-saturating) truncating store, masked or unmasked.
20533 if (isAllOnesConstant(Mask)) // return just a truncate store
20534 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20535 MemIntr->getMemOperand());
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20538 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20540 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20541 MemIntr->getMemOperand(), true /* truncating */);
20543 case X86ISD::VTRUNCUS:
20544 case X86ISD::VTRUNCS: {
// Saturating truncating store; signedness chosen by the opcode.
20545 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20546 if (isAllOnesConstant(Mask))
20547 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20548 MemIntr->getMemOperand(), DAG);
20550 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20551 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20553 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20554 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20557 llvm_unreachable("Unsupported truncstore intrinsic");
20561 case EXPAND_FROM_MEM: {
20562 SDValue Mask = Op.getOperand(4);
20563 SDValue PassThru = Op.getOperand(3);
20564 SDValue Addr = Op.getOperand(2);
20565 SDValue Chain = Op.getOperand(0);
20566 MVT VT = Op.getSimpleValueType();
20568 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20569 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20571 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20572 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
// All-zero mask: nothing is loaded, result lanes are unspecified.
20573 if (X86::isZeroNode(Mask))
20574 return DAG.getUNDEF(VT);
20576 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20577 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Expanding masked load: consecutive memory elements fill active lanes.
20578 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20579 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20580 true /* expanding */);
// Lower ISD::RETURNADDR. Depth 0 loads the return address from its frame
// slot; non-zero depth walks via the frame address and loads the slot one
// pointer-size above it. Marks the return address as taken in MFI.
20585 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20586 SelectionDAG &DAG) const {
20587 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20588 MFI.setReturnAddressIsTaken(true);
// Operand 0 (the depth) must be a constant; bail out otherwise.
20590 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20593 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20595 EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Depth > 0: return address lives at FrameAddr + SlotSize.
20598 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20599 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20600 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20601 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20602 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20603 MachinePointerInfo());
20606 // Just load the return address.
20607 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20608 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20609 MachinePointerInfo());
// Lower ISD::ADDROFRETURNADDR: return the frame index of the return-address
// slot itself (its address, not its contents).
20612 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20613 SelectionDAG &DAG) const {
20614 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20615 return getReturnAddressFrameIndex(DAG);
// Lower ISD::FRAMEADDR. On Windows-CFI targets a fixed frame object at the
// return-address slot is used (cached in X86MachineFunctionInfo); otherwise
// the frame pointer register is copied out and dereferenced once per level
// of requested depth.
20618 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20619 MachineFunction &MF = DAG.getMachineFunction();
20620 MachineFrameInfo &MFI = MF.getFrameInfo();
20621 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20622 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20623 EVT VT = Op.getValueType();
20625 MFI.setFrameAddressIsTaken(true);
20627 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20628 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20629 // is not possible to crawl up the stack without looking at the unwind codes
// so we only honor depth 0 here and return this frame's slot.
20631 int FrameAddrIndex = FuncInfo->getFAIndex();
20632 if (!FrameAddrIndex) {
20633 // Set up a frame object for the return address.
20634 unsigned SlotSize = RegInfo->getSlotSize();
20635 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20636 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
// Cache the index so repeated frameaddress calls reuse one object.
20637 FuncInfo->setFAIndex(FrameAddrIndex);
20639 return DAG.getFrameIndex(FrameAddrIndex, VT);
20642 unsigned FrameReg =
20643 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20644 SDLoc dl(Op); // FIXME probably not meaningful
20645 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20646 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20647 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20648 "Invalid Frame Register!");
20649 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
// Each extra depth level dereferences the saved frame pointer once.
20651 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20652 MachinePointerInfo());
20656 // FIXME? Maybe this could be a TableGen attribute on some registers and
20657 // this table could be generated automatically from RegInfo.
/// Map a register name (as used e.g. by named-register global variables; the
/// fatal error below mentions that use) to its physical register number.
/// Only the stack/frame pointer names are recognized.
20658 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20659 SelectionDAG &DAG) const {
20660 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20661 const MachineFunction &MF = DAG.getMachineFunction();
20663 unsigned Reg = StringSwitch<unsigned>(RegName)
20664 .Case("esp", X86::ESP)
20665 .Case("rsp", X86::RSP)
20666 .Case("ebp", X86::EBP)
20667 .Case("rbp", X86::RBP)
// Naming the frame pointer is only meaningful when this function actually
// reserves one; otherwise the register is allocatable and the request is
// rejected outright.
20670 if (Reg == X86::EBP || Reg == X86::RBP) {
20671 if (!TFI.hasFP(MF))
20672 report_fatal_error("register " + StringRef(RegName) +
20673 " is allocatable: function has no frame pointer");
20676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20677 unsigned FrameReg =
20678 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20679 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20680 "Invalid Frame Register!");
// Unrecognized name: hard error rather than silently returning no register.
20688 report_fatal_error("Invalid register name global variable");
/// Lower ISD::FRAME_TO_ARGS_OFFSET: the incoming arguments start two stack
/// slots past the frame pointer (presumably saved FP + return address —
/// confirm against the frame layout).
20691 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20692 SelectionDAG &DAG) const {
20693 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20694 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
/// Register that receives the exception pointer on landing: RDX/EDX for the
/// CoreCLR personality, RAX/EAX for every other personality.
20697 unsigned X86TargetLowering::getExceptionPointerRegister(
20698 const Constant *PersonalityFn) const {
20699 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20700 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20702 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
/// Register that receives the exception selector on landing (RDX/EDX).
/// Funclet-based personalities must never ask for one — see the assert.
20705 unsigned X86TargetLowering::getExceptionSelectorRegister(
20706 const Constant *PersonalityFn) const {
20707 // Funclet personalities don't use selectors (the runtime does the selection).
20708 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20709 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
/// Catch objects need a fixed frame position only on Win64.
20712 bool X86TargetLowering::needsFixedCatchObjects() const {
20713 return Subtarget.isTargetWin64();
/// Lower ISD::EH_RETURN: overwrite the caller's return-address slot with the
/// handler address and emit the X86ISD::EH_RETURN pseudo, passing the
/// rewritten slot address in RCX/ECX.
20716 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20717 SDValue Chain = Op.getOperand(0);
20718 SDValue Offset = Op.getOperand(1);
20719 SDValue Handler = Op.getOperand(2);
20722 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20723 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20724 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20725 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20726 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20727 "Invalid Frame Register!");
20728 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20729 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
// Address of the slot to rewrite: frame pointer + one slot + Offset operand.
20731 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20732 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20734 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
// Store the handler there and publish the slot address in StoreAddrReg for
// the EH_RETURN pseudo to consume.
20735 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20736 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20738 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20739 DAG.getRegister(StoreAddrReg, PtrVT));
/// Lower the SjLj setjmp intrinsic to the X86ISD::EH_SJLJ_SETJMP node,
/// returning (i32 result, chain).
20742 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20743 SelectionDAG &DAG) const {
20745 // If the subtarget is not 64bit, we may need the global base reg
20746 // after isel expand pseudo, i.e., after CGBR pass ran.
20747 // Therefore, ask for the GlobalBaseReg now, so that the pass
20748 // inserts the code for us in case we need it.
20749 // Otherwise, we will end up in a situation where we will
20750 // reference a virtual register that is not defined!
20751 if (!Subtarget.is64Bit()) {
20752 const X86InstrInfo *TII = Subtarget.getInstrInfo();
// Result deliberately discarded: the call's side effect (materializing the
// global base register) is all that is needed here.
20753 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20755 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20756 DAG.getVTList(MVT::i32, MVT::Other),
20757 Op.getOperand(0), Op.getOperand(1));
/// Lower the SjLj longjmp intrinsic to the X86ISD::EH_SJLJ_LONGJMP node
/// (chain-only result; control does not return).
20760 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20761 SelectionDAG &DAG) const {
20763 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20764 Op.getOperand(0), Op.getOperand(1));
/// Lower the SjLj setup-dispatch intrinsic to its X86 pseudo node.
20767 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20768 SelectionDAG &DAG) const {
20770 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
/// ADJUST_TRAMPOLINE is a no-op on x86: the trampoline address is directly
/// callable, so just forward the pointer operand.
20774 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20775 return Op.getOperand(0);
/// Lower llvm.init.trampoline: write a small machine-code stub at the
/// trampoline address that loads the 'nest' parameter into its dedicated
/// register and jumps to the nested function. The stores are collected in
/// OutChains and merged with a TokenFactor.
20778 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20779 SelectionDAG &DAG) const {
20780 SDValue Root = Op.getOperand(0);
20781 SDValue Trmp = Op.getOperand(1); // trampoline
20782 SDValue FPtr = Op.getOperand(2); // nested function
20783 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20786 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20787 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// x86-64 stub: movabs r11,<fptr>; movabs r10,<nest>; jmp *r11 — bytes are
// stored at trampoline offsets 0, 2, 10, 12, 20 and 22.
20789 if (Subtarget.is64Bit()) {
20790 SDValue OutChains[6];
20792 // Large code-model.
20793 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20794 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20796 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20797 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20799 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20801 // Load the pointer to the nested function into R11.
20802 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20803 SDValue Addr = Trmp;
20804 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20805 Addr, MachinePointerInfo(TrmpAddr));
20807 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20808 DAG.getConstant(2, dl, MVT::i64));
20810 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20811 /* Alignment = */ 2);
20813 // Load the 'nest' parameter value into R10.
20814 // R10 is specified in X86CallingConv.td
20815 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20816 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20817 DAG.getConstant(10, dl, MVT::i64));
20818 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20819 Addr, MachinePointerInfo(TrmpAddr, 10));
20821 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20822 DAG.getConstant(12, dl, MVT::i64));
20824 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20825 /* Alignment = */ 2);
20827 // Jump to the nested function.
20828 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20829 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20830 DAG.getConstant(20, dl, MVT::i64));
20831 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20832 Addr, MachinePointerInfo(TrmpAddr, 20));
// ModRM byte selecting r11 as the jmp target (reg field /4 = jmp, mod = 11b).
20834 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20835 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20836 DAG.getConstant(22, dl, MVT::i64));
20837 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20838 Addr, MachinePointerInfo(TrmpAddr, 22));
20840 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
// i386 path: pick the register carrying 'nest' for this calling convention,
// checking it was not already claimed by 'inreg' parameters.
20842 const Function *Func =
20843 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20844 CallingConv::ID CC = Func->getCallingConv();
20849 llvm_unreachable("Unsupported calling convention");
20850 case CallingConv::C:
20851 case CallingConv::X86_StdCall: {
20852 // Pass 'nest' parameter in ECX.
20853 // Must be kept in sync with X86CallingConv.td
20854 NestReg = X86::ECX;
20856 // Check that ECX wasn't needed by an 'inreg' parameter.
20857 FunctionType *FTy = Func->getFunctionType();
20858 const AttributeList &Attrs = Func->getAttributes();
20860 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20861 unsigned InRegCount = 0;
20864 for (FunctionType::param_iterator I = FTy->param_begin(),
20865 E = FTy->param_end(); I != E; ++I, ++Idx)
20866 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20867 auto &DL = DAG.getDataLayout();
20868 // FIXME: should only count parameters that are lowered to integers.
20869 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20872 if (InRegCount > 2) {
20873 report_fatal_error("Nest register in use - reduce number of inreg"
20879 case CallingConv::X86_FastCall:
20880 case CallingConv::X86_ThisCall:
20881 case CallingConv::Fast:
20882 // Pass 'nest' parameter in EAX.
20883 // Must be kept in sync with X86CallingConv.td
20884 NestReg = X86::EAX;
// i386 stub: mov <NestReg>, <nest imm32>; jmp <rel32 to FPtr> — bytes at
// trampoline offsets 0, 1, 5 and 6.
20888 SDValue OutChains[4];
20889 SDValue Addr, Disp;
// The jmp displacement is relative to the end of the 10-byte stub.
20891 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20892 DAG.getConstant(10, dl, MVT::i32));
20893 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20895 // This is storing the opcode for MOV32ri.
20896 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20897 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20899 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20900 Trmp, MachinePointerInfo(TrmpAddr));
20902 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20903 DAG.getConstant(1, dl, MVT::i32));
20905 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20906 /* Alignment = */ 1);
20908 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20909 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20910 DAG.getConstant(5, dl, MVT::i32));
20911 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20912 Addr, MachinePointerInfo(TrmpAddr, 5),
20913 /* Alignment = */ 1);
20915 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20916 DAG.getConstant(6, dl, MVT::i32));
20918 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20919 /* Alignment = */ 1);
20921 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
/// Lower ISD::FLT_ROUNDS_: store the x87 control word to a stack slot
/// (FNSTCW), reload it, and remap FPSR bits 11:10 into the FLT_ROUNDS
/// encoding via the bit formula documented below.
20925 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20926 SelectionDAG &DAG) const {
20928 The rounding mode is in bits 11:10 of FPSR, and has the following
20930 00 Round to nearest
20935 FLT_ROUNDS, on the other hand, expects the following:
20942 To perform the conversion, we do:
20943 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20946 MachineFunction &MF = DAG.getMachineFunction();
20947 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20948 unsigned StackAlignment = TFI.getStackAlignment();
20949 MVT VT = Op.getSimpleValueType();
20952 // Save FP Control Word to stack slot
20953 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20954 SDValue StackSlot =
20955 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20957 MachineMemOperand *MMO =
20958 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20959 MachineMemOperand::MOStore, 2, 2);
20961 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20962 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20963 DAG.getVTList(MVT::Other),
20964 Ops, MVT::i16, MMO);
20966 // Load FP Control Word from stack slot
20968 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20970 // Transform as necessary
// Isolate bit 11 (shifted to bit 0) and bit 10 (shifted to bit 1) of the
// control word, then apply the (+1) & 3 remapping from the formula above.
20972 DAG.getNode(ISD::SRL, DL, MVT::i16,
20973 DAG.getNode(ISD::AND, DL, MVT::i16,
20974 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20975 DAG.getConstant(11, DL, MVT::i8));
20977 DAG.getNode(ISD::SRL, DL, MVT::i16,
20978 DAG.getNode(ISD::AND, DL, MVT::i16,
20979 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20980 DAG.getConstant(9, DL, MVT::i8));
20983 DAG.getNode(ISD::AND, DL, MVT::i16,
20984 DAG.getNode(ISD::ADD, DL, MVT::i16,
20985 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20986 DAG.getConstant(1, DL, MVT::i16)),
20987 DAG.getConstant(3, DL, MVT::i16));
// Match the i16 intermediate to the requested result width.
20989 return DAG.getNode((VT.getSizeInBits() < 16 ?
20990 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20993 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
20995 // 1. i32/i64 128/256-bit vector (native support require VLX) are expended
20996 // to 512-bit vector.
20997 // 2. i8/i16 vector implemented using dword LZCNT vector instruction
20998 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20999 // split the vector, perform operation on it's Lo a Hi part and
21000 // concatenate the results.
21001 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
21002 assert(Op.getOpcode() == ISD::CTLZ);
21004 MVT VT = Op.getSimpleValueType();
21005 MVT EltVT = VT.getVectorElementType();
21006 unsigned NumElems = VT.getVectorNumElements();
// Case 1: i32/i64 elements — widen the vector to 512 bits, CTLZ there,
// and extract the original-width result. Leading-zero counts are unchanged
// because the element type is preserved.
21008 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
21009 // Extend to 512 bit vector.
21010 assert((VT.is256BitVector() || VT.is128BitVector()) &&
21011 "Unsupported value type for operation");
21013 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
21014 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
21015 DAG.getUNDEF(NewVT),
21017 DAG.getIntPtrConstant(0, dl));
21018 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
21020 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
21021 DAG.getIntPtrConstant(0, dl));
21024 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21025 "Unsupported element type");
// Case 2a: more than 16 small elements — split and recurse via the CTLZ
// nodes emitted for each half.
21027 if (16 < NumElems) {
21028 // Split vector, it's Lo and Hi parts will be handled in next iteration.
21030 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
21031 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
21033 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
21034 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
21036 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
// Case 2b: zero-extend each lane to i32, use the dword CTLZ, truncate back,
// and subtract the extra leading zeros (32 - element width) introduced by
// the extension.
21039 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21041 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21042 "Unsupported value type for operation");
21044 // Use native supported vector instruction vplzcntd.
21045 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21046 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21047 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21048 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21050 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21053 // Lower CTLZ using a PSHUFB lookup table implementation.
// Works on the input as a byte vector: per-nibble leading-zero counts come
// from a 16-entry PSHUFB table, bytes are combined from the nibble counts,
// then wider elements are built by repeatedly doubling the lane width.
21054 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21055 const X86Subtarget &Subtarget,
21056 SelectionDAG &DAG) {
21057 MVT VT = Op.getSimpleValueType();
21058 int NumElts = VT.getVectorNumElements();
21059 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21060 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21062 // Per-nibble leading zero PSHUFB lookup table.
21063 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21064 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21065 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21066 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
// Replicate the 16-entry table across every 16-byte lane of the vector.
21068 SmallVector<SDValue, 64> LUTVec;
21069 for (int i = 0; i < NumBytes; ++i)
21070 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21071 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21073 // Begin by bitcasting the input to byte vector, then split those bytes
21074 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21075 // If the hi input nibble is zero then we add both results together, otherwise
21076 // we just take the hi result (by masking the lo result to zero before the
21078 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21079 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21081 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21082 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21083 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21084 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
// HiZ is an all-ones byte mask where the high nibble is zero.
21085 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21087 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21088 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21089 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21090 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21092 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21093 // of the current vector width in the same way we did for the nibbles.
21094 // If the upper half of the input element is zero then add the halves'
21095 // leading zero counts together, otherwise just use the upper half's.
21096 // Double the width of the result until we are at target width.
21097 while (CurrVT != VT) {
21098 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21099 int CurrNumElts = CurrVT.getVectorNumElements();
21100 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21101 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21102 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21104 // Check if the upper half of the input element is zero.
21105 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21106 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21107 HiZ = DAG.getBitcast(NextVT, HiZ);
21109 // Move the upper/lower halves to the lower bits as we'll be extending to
21110 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21112 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21113 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21114 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21115 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21116 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
/// Dispatch vector CTLZ lowering: AVX-512 native path, 256-bit split for
/// pre-AVX2 targets, otherwise the SSSE3 PSHUFB lookup-table path.
21123 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21124 const X86Subtarget &Subtarget,
21125 SelectionDAG &DAG) {
21126 MVT VT = Op.getSimpleValueType();
21127 SDValue Op0 = Op.getOperand(0);
21129 if (Subtarget.hasAVX512())
21130 return LowerVectorCTLZ_AVX512(Op, DAG);
21132 // Decompose 256-bit ops into smaller 128-bit ops.
21133 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21134 unsigned NumElems = VT.getVectorNumElements();
21136 // Extract each 128-bit vector, perform ctlz and concat the result.
21137 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21138 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21140 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21141 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
21142 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
21145 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21146 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
/// Lower scalar (and, via LowerVectorCTLZ, vector) CTLZ/CTLZ_ZERO_UNDEF
/// using BSR. BSR yields the index of the highest set bit; XOR with
/// NumBits-1 converts that index into a leading-zero count.
21149 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21150 SelectionDAG &DAG) {
21151 MVT VT = Op.getSimpleValueType();
21153 unsigned NumBits = VT.getSizeInBits();
21155 unsigned Opc = Op.getOpcode();
21158 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21160 Op = Op.getOperand(0);
21161 if (VT == MVT::i8) {
21162 // Zero extend to i32 since there is not an i8 bsr.
21164 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21167 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21168 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21169 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21171 if (Opc == ISD::CTLZ) {
21172 // If src is zero (i.e. bsr sets ZF), returns NumBits.
// CMOV selects 2*NumBits-1 on ZF; the XOR with NumBits-1 below then maps
// that sentinel to exactly NumBits, the defined CTLZ result for zero input.
21175 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21176 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21179 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21182 // Finally xor with NumBits-1.
21183 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21184 DAG.getConstant(NumBits - 1, dl, OpVT));
// i8 inputs were widened to i32 above, so narrow the result back down.
21187 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
/// Lower CTTZ/CTTZ_ZERO_UNDEF. Vectors are rewritten via the isolated
/// lowest set bit (x & -x); scalars use BSF with a CMOV to supply the
/// zero-input result.
21191 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21192 MVT VT = Op.getSimpleValueType();
21193 unsigned NumBits = VT.getScalarSizeInBits();
21196 if (VT.isVector()) {
21197 SDValue N0 = Op.getOperand(0);
21198 SDValue Zero = DAG.getConstant(0, dl, VT);
21200 // lsb(x) = (x & -x)
21201 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21202 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21204 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21205 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21206 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21207 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21208 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21211 // cttz(x) = ctpop(lsb - 1)
21212 SDValue One = DAG.getConstant(1, dl, VT);
21213 return DAG.getNode(ISD::CTPOP, dl, VT,
21214 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21217 assert(Op.getOpcode() == ISD::CTTZ &&
21218 "Only scalar CTTZ requires custom lowering");
21220 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21221 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21222 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21224 // If src is zero (i.e. bsf sets ZF), returns NumBits.
// CMOV on ZF replaces BSF's undefined zero-input result with NumBits.
21227 DAG.getConstant(NumBits, dl, VT),
21228 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21231 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21234 /// Break a 256-bit integer operation into two new 128-bit ones and then
21235 /// concatenate the result back.
/// The same opcode is re-emitted on both halves, so this works for any
/// binary integer op whose 128-bit form is legal or further lowered.
21236 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21237 MVT VT = Op.getSimpleValueType();
21239 assert(VT.is256BitVector() && VT.isInteger() &&
21240 "Unsupported value type for operation");
21242 unsigned NumElems = VT.getVectorNumElements();
21245 // Extract the LHS vectors
21246 SDValue LHS = Op.getOperand(0);
21247 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21248 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21250 // Extract the RHS vectors
21251 SDValue RHS = Op.getOperand(1);
21252 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21253 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21255 MVT EltVT = VT.getVectorElementType();
21256 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21258 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21259 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21260 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21263 /// Break a 512-bit integer operation into two new 256-bit ones and then
21264 /// concatenate the result back.
/// Mirrors Lower256IntArith one level up: same-opcode halves, concatenated.
21265 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21266 MVT VT = Op.getSimpleValueType();
21268 assert(VT.is512BitVector() && VT.isInteger() &&
21269 "Unsupported value type for operation");
21271 unsigned NumElems = VT.getVectorNumElements();
21274 // Extract the LHS vectors
21275 SDValue LHS = Op.getOperand(0);
21276 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21277 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21279 // Extract the RHS vectors
21280 SDValue RHS = Op.getOperand(1);
21281 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21282 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21284 MVT EltVT = VT.getVectorElementType();
21285 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21287 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21288 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21289 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
/// Lower vector add/sub: i1 elements become XOR (add and sub are both
/// addition mod 2); anything else reaching here is a 256-bit integer vector
/// that gets split into 128-bit halves.
21292 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21293 MVT VT = Op.getSimpleValueType();
21294 if (VT.getScalarType() == MVT::i1)
21295 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21296 Op.getOperand(0), Op.getOperand(1));
21297 assert(Op.getSimpleValueType().is256BitVector() &&
21298 Op.getSimpleValueType().isInteger() &&
21299 "Only handle AVX 256-bit vector integer operation");
21300 return Lower256IntArith(Op, DAG);
/// Lower a 256-bit integer ISD::ABS by splitting into two 128-bit halves
/// and concatenating the per-half ABS results.
21303 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21304 assert(Op.getSimpleValueType().is256BitVector() &&
21305 Op.getSimpleValueType().isInteger() &&
21306 "Only handle AVX 256-bit vector integer operation");
21307 MVT VT = Op.getSimpleValueType();
21308 unsigned NumElems = VT.getVectorNumElements();
21311 SDValue Src = Op.getOperand(0);
21312 SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
21313 SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
21315 MVT EltVT = VT.getVectorElementType();
21316 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21317 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21318 DAG.getNode(ISD::ABS, dl, NewVT, Lo),
21319 DAG.getNode(ISD::ABS, dl, NewVT, Hi));
/// Lower 256-bit vector min/max by delegating to the generic 128-bit split.
21322 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21323 assert(Op.getSimpleValueType().is256BitVector() &&
21324 Op.getSimpleValueType().isInteger() &&
21325 "Only handle AVX 256-bit vector integer operation");
21326 return Lower256IntArith(Op, DAG);
/// Lower vector ISD::MUL for types with no direct instruction:
///  - i1 elements: multiply is AND;
///  - v16i8/v32i8/v64i8: sign-extend to i16 lanes, multiply, truncate/pack;
///  - v4i32 pre-SSE4.1: two PMULUDQ plus shuffles;
///  - 64-bit elements: three PMULUDQ partial products (or one PMULDQ when
///    both operands have >32 sign bits).
21329 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21330 SelectionDAG &DAG) {
21332 MVT VT = Op.getSimpleValueType();
21334 if (VT.getScalarType() == MVT::i1)
21335 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21337 // Decompose 256-bit ops into smaller 128-bit ops.
21338 if (VT.is256BitVector() && !Subtarget.hasInt256())
21339 return Lower256IntArith(Op, DAG);
21341 SDValue A = Op.getOperand(0);
21342 SDValue B = Op.getOperand(1);
21344 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21345 // vector pairs, multiply and truncate.
21346 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21347 if (Subtarget.hasInt256()) {
21348 // For 512-bit vectors, split into 256-bit vectors to allow the
21349 // sign-extension to occur.
21350 if (VT == MVT::v64i8)
21351 return Lower512IntArith(Op, DAG);
21353 // For 256-bit vectors, split into 128-bit vectors to allow the
21354 // sign-extension to occur. We don't need this on AVX512BW as we can
21355 // safely sign-extend to v32i16.
21356 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21357 return Lower256IntArith(Op, DAG);
21359 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21360 return DAG.getNode(
21361 ISD::TRUNCATE, dl, VT,
21362 DAG.getNode(ISD::MUL, dl, ExVT,
21363 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21364 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21367 assert(VT == MVT::v16i8 &&
21368 "Pre-AVX2 support only supports v16i8 multiplication");
21369 MVT ExVT = MVT::v8i16;
21371 // Extract the lo parts and sign extend to i16
// Without SSE4.1's pmovsx, emulate the sign-extension by interleaving bytes
// into the high half of each i16 lane and arithmetic-shifting right by 8.
21373 if (Subtarget.hasSSE41()) {
21374 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21375 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21377 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21378 -1, 4, -1, 5, -1, 6, -1, 7};
21379 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21380 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21381 ALo = DAG.getBitcast(ExVT, ALo);
21382 BLo = DAG.getBitcast(ExVT, BLo);
21383 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21384 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21387 // Extract the hi parts and sign extend to i16
21389 if (Subtarget.hasSSE41()) {
21390 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21391 -1, -1, -1, -1, -1, -1, -1, -1};
21392 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21393 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21394 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21395 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21397 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21398 -1, 12, -1, 13, -1, 14, -1, 15};
21399 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21400 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21401 AHi = DAG.getBitcast(ExVT, AHi);
21402 BHi = DAG.getBitcast(ExVT, BHi);
21403 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21404 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21407 // Multiply, mask the lower 8bits of the lo/hi results and pack
21408 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21409 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21410 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21411 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21412 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21415 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21416 if (VT == MVT::v4i32) {
21417 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21418 "Should not custom lower when pmuldq is available!");
21420 // Extract the odd parts.
21421 static const int UnpackMask[] = { 1, -1, 3, -1 };
21422 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21423 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21425 // Multiply the even parts.
21426 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21427 // Now multiply odd parts.
21428 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21430 Evens = DAG.getBitcast(VT, Evens);
21431 Odds = DAG.getBitcast(VT, Odds);
21433 // Merge the two vectors back together with a shuffle. This expands into 2
21435 static const int ShufMask[] = { 0, 4, 2, 6 };
21436 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21439 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21440 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21442 // 32-bit vector types used for MULDQ/MULUDQ.
21443 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21445 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21446 // 32-bits. We can lower with this if the sign bits stretch that far.
21447 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21448 DAG.ComputeNumSignBits(B) > 32) {
21449 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21450 DAG.getBitcast(MulVT, B));
21453 // Ahi = psrlqi(a, 32);
21454 // Bhi = psrlqi(b, 32);
21456 // AloBlo = pmuludq(a, b);
21457 // AloBhi = pmuludq(a, Bhi);
21458 // AhiBlo = pmuludq(Ahi, b);
21460 // Hi = psllqi(AloBhi + AhiBlo, 32);
21461 // return AloBlo + Hi;
// Known-bits analysis lets us skip partial products whose contributing half
// is provably zero (they would just add zero).
21462 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21463 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21464 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21466 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21467 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21468 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21470 // Bit cast to 32-bit vectors for MULUDQ.
21471 SDValue Alo = DAG.getBitcast(MulVT, A);
21472 SDValue Blo = DAG.getBitcast(MulVT, B);
21474 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21476 // Only multiply lo/hi halves that aren't known to be zero.
21477 SDValue AloBlo = Zero;
21478 if (!ALoIsZero && !BLoIsZero)
21479 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21481 SDValue AloBhi = Zero;
21482 if (!ALoIsZero && !BHiIsZero) {
21483 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21484 Bhi = DAG.getBitcast(MulVT, Bhi);
21485 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21488 SDValue AhiBlo = Zero;
21489 if (!AHiIsZero && !BLoIsZero) {
21490 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21491 Ahi = DAG.getBitcast(MulVT, Ahi);
21492 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
// Combine: the two cross products land in the upper 32 bits of each lane.
21495 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21496 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21498 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
// Lower vXi8 high-half multiplies (ISD::MULHU, and the signed variant for any
// other opcode — only MULHU is tested explicitly below), for which x86 has no
// byte-sized instruction: widen each i8 lane to i16, multiply, shift the
// 16-bit products right by 8, and pack the high bytes back into a vXi8 result.
21501 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21502 SelectionDAG &DAG) {
21504 MVT VT = Op.getSimpleValueType();
21506 // Decompose 256-bit ops into smaller 128-bit ops.
21507 if (VT.is256BitVector() && !Subtarget.hasInt256())
21508 return Lower256IntArith(Op, DAG);
21510 // Only i8 vectors should need custom lowering after this.
21511 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21512 "Unsupported vector type");
21514 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21515 // logical shift down the upper half and pack back to i8.
21516 SDValue A = Op.getOperand(0);
21517 SDValue B = Op.getOperand(1);
21519 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21520 // and then ashr/lshr the upper bits down to the lower bits before multiply.
// ExShift: shift used by the pre-SSE41 unpack trick (logical for unsigned,
// arithmetic for signed). ExSSE41: extend node used on SSE41+ targets
// (zero-extend for unsigned, sign-extend otherwise).
21521 unsigned Opcode = Op.getOpcode();
21522 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21523 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21525 // AVX2 implementations - extend xmm subvectors to ymm.
21526 if (Subtarget.hasInt256()) {
// Lo/Hi start as the subvector extraction indices for the two halves; they
// are later reused to hold the two halved products.
21527 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21528 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21530 if (VT == MVT::v32i8) {
// v32i8: split into two v16i8 halves and widen each to v16i16.
21531 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21532 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21533 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21534 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21535 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21536 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21537 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21538 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
// Multiply the widened halves and shift the high byte of each i16 product
// down into the low byte.
21539 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21540 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21541 DAG.getConstant(8, dl, MVT::v16i16));
21542 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21543 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21544 DAG.getConstant(8, dl, MVT::v16i16));
21545 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21546 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21547 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21548 16, 17, 18, 19, 20, 21, 22, 23};
21549 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21550 24, 25, 26, 27, 28, 29, 30, 31};
21551 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21552 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21553 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
// v16i8 with AVX2: widen the whole vector to v16i16 in one step, multiply,
// shift down by 8, then extract the two v8i16 halves for PACKUS.
21556 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21557 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21558 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21559 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21560 DAG.getConstant(8, dl, MVT::v16i16));
21561 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21562 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21563 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21566 assert(VT == MVT::v16i8 &&
21567 "Pre-AVX2 support only supports v16i8 multiplication");
21568 MVT ExVT = MVT::v8i16;
21570 // Extract the lo parts and zero/sign extend to i16.
21572 if (Subtarget.hasSSE41()) {
21573 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21574 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
// Pre-SSE41: place each source byte in the high half of an i16 lane (shuffle
// with undef in the low byte), then shift it down by 8 — logical shift
// zero-extends, arithmetic shift sign-extends.
21576 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21577 -1, 4, -1, 5, -1, 6, -1, 7};
21578 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21579 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21580 ALo = DAG.getBitcast(ExVT, ALo);
21581 BLo = DAG.getBitcast(ExVT, BLo);
21582 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21583 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21586 // Extract the hi parts and zero/sign extend to i16.
21588 if (Subtarget.hasSSE41()) {
// SSE41: move the upper 8 bytes down first, then use the extend node.
21589 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21590 -1, -1, -1, -1, -1, -1, -1, -1};
21591 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21592 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21593 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21594 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
// Pre-SSE41: same interleave-then-shift extension trick for the upper bytes.
21596 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21597 -1, 12, -1, 13, -1, 14, -1, 15};
21598 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21599 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21600 AHi = DAG.getBitcast(ExVT, AHi);
21601 BHi = DAG.getBitcast(ExVT, BHi);
21602 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21603 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21606 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21607 // pack back to v16i8.
21608 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21609 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21610 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21611 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21612 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
// Lower a 128-bit integer division/remainder node to a runtime-library call
// on Win64. Each i128 operand is spilled to a 16-byte-aligned stack temporary
// and its address is passed instead of the value (presumably because the
// Win64 ABI passes oversized integers by reference — NOTE(review): confirm
// against the MS x64 calling convention). The libcall result is produced as
// v2i64 and bitcast back to the original i128 value type.
21615 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21616 assert(Subtarget.isTargetWin64() && "Unexpected target");
21617 EVT VT = Op.getValueType();
21618 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21619 "Unexpected return type for lowering");
// Pick the RTLIB libcall and its signedness from the node's opcode.
21623 switch (Op->getOpcode()) {
21624 default: llvm_unreachable("Unexpected request for libcall!");
21625 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21626 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21627 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21628 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21629 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21630 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
// Chain the argument spills off the entry node; the stores below are
// threaded through InChain so they complete before the call.
21634 SDValue InChain = DAG.getEntryNode();
21636 TargetLowering::ArgListTy Args;
21637 TargetLowering::ArgListEntry Entry;
21638 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21639 EVT ArgVT = Op->getOperand(i).getValueType();
21640 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21641 "Unexpected argument type for lowering");
// Spill the i128 operand to a fresh 16-byte-aligned stack slot and pass a
// pointer to it as the actual call argument.
21642 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21643 Entry.Node = StackPtr;
21644 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21645 MachinePointerInfo(), /* Alignment = */ 16);
21646 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21647 Entry.Ty = PointerType::get(ArgTy,0);
// Pointer arguments need no sign/zero extension.
21648 Entry.IsSExt = false;
21649 Entry.IsZExt = false;
21650 Args.push_back(Entry);
21653 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21654 getPointerTy(DAG.getDataLayout()));
// Build and emit the call; the return type is declared as v2i64 so the
// 128-bit result comes back in a vector register.
21656 TargetLowering::CallLoweringInfo CLI(DAG);
21657 CLI.setDebugLoc(dl)
21660 getLibcallCallingConv(LC),
21661 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21664 .setSExtResult(isSigned)
21665 .setZExtResult(!isSigned);
21667 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21668 return DAG.getBitcast(VT, CallInfo.first);
// Lower ISD::SMUL_LOHI/UMUL_LOHI for v4i32/v8i32 using PMULUDQ/PMULDQ, which
// multiply the even 32-bit lanes into full 64-bit products. Two multiplies
// are emitted (even lanes, and odd lanes shifted into even positions) and the
// 64-bit products are shuffled apart into the (Lows, Highs) result pair.
21671 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21672 SelectionDAG &DAG) {
21673 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21674 MVT VT = Op0.getSimpleValueType();
21677 // Decompose 256-bit ops into smaller 128-bit ops.
21678 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21679 unsigned Opcode = Op.getOpcode();
21680 unsigned NumElems = VT.getVectorNumElements();
21681 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21682 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21683 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21684 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21685 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
// Each half produces a (lo, hi) pair; concatenate the two lo results and
// the two hi results to rebuild the full-width pair.
21686 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21687 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21689 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21690 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21692 return DAG.getMergeValues(Ops, dl);
21695 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21696 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21698 // PMULxD operations multiply each even value (starting at 0) of LHS with
21699 // the related value of RHS and produce a widen result.
21700 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21701 // => <2 x i64> <ae|cg>
21703 // In other word, to have all the results, we need to perform two PMULxD:
21704 // 1. one with the even values.
21705 // 2. one with the odd values.
21706 // To achieve #2, with need to place the odd values at an even position.
21708 // Place the odd value at an even position (basically, shift all values 1
21709 // step to the left):
21710 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21711 // <a|b|c|d> => <b|undef|d|undef>
21712 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21713 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21714 // <e|f|g|h> => <f|undef|h|undef>
21715 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21716 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21718 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
// Use the signed PMULDQ only when the node is signed AND SSE41 provides it;
// otherwise fall back to unsigned PMULUDQ and fix the high parts up below.
21720 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21721 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21723 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21724 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21725 // => <2 x i64> <ae|cg>
21726 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21727 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21728 // => <2 x i64> <bf|dh>
21729 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21731 // Shuffle it back into the right order.
21732 SDValue Highs, Lows;
21733 if (VT == MVT::v8i32) {
21734 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21735 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21736 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21737 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21739 const int HighMask[] = {1, 5, 3, 7};
21740 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21741 const int LowMask[] = {0, 4, 2, 6};
21742 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21745 // If we have a signed multiply but no PMULDQ fix up the high parts of a
21746 // unsigned multiply.
// Standard signed-from-unsigned correction: subtract (a<0 ? b : 0) and
// (b<0 ? a : 0) from the unsigned high half. The sign is materialized as an
// all-ones/all-zeros mask via SRA and AND-ed with the other operand.
21747 if (IsSigned && !Subtarget.hasSSE41()) {
21748 SDValue ShAmt = DAG.getConstant(
21750 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21751 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21752 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21753 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21754 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21756 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21757 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21760 // The first result of MUL_LOHI is actually the low value, followed by the
21762 SDValue Ops[] = {Lows, Highs};
21763 return DAG.getMergeValues(Ops, dl);
21766 // Return true if the required (according to Opcode) shift-imm form is natively
21767 // supported by the Subtarget
21768 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
// vXi8 has no native shift-by-immediate instruction.
21770 if (VT.getScalarSizeInBits() < 16)
// 512-bit vectors: AVX512F covers 32/64-bit elements; 16-bit elements
// additionally require BWI.
21773 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21774 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
// Logical shifts: 128-bit needs SSE2, 256-bit needs AVX2.
21777 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21778 (VT.is256BitVector() && Subtarget.hasInt256());
// Arithmetic shifts match the logical case except that 64-bit elements
// (v2i64/v4i64) are only available with AVX512.
21780 bool AShift = LShift && (Subtarget.hasAVX512() ||
21781 (VT != MVT::v2i64 && VT != MVT::v4i64));
21782 return (Opcode == ISD::SRA) ? AShift : LShift;
21785 // The shift amount is a variable, but it is the same for all vector lanes.
21786 // These instructions are defined together with shift-immediate.
// Shift-by-uniform-scalar support mirrors shift-by-immediate support exactly,
// so simply delegate.
21788 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21790 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21793 // Return true if the required (according to Opcode) variable-shift form is
21794 // natively supported by the Subtarget
21795 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
// Per-lane variable shifts need at least AVX2, and have no vXi8 form.
21798 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21801 // vXi16 supported only on AVX-512, BWI
21802 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
// With AVX512 all remaining cases (including arithmetic i64) are native.
21805 if (Subtarget.hasAVX512())
// AVX2 only: logical shifts for 128/256-bit vectors; arithmetic shifts for
// everything except 64-bit elements.
21808 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21809 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21810 return (Opcode == ISD::SRA) ? AShift : LShift;
// Lower a vector shift whose amount is a uniform constant. Handles (1) the
// direct shift-by-immediate case, (2) emulated 64-bit arithmetic shifts,
// (3) vXi8 shifts synthesized from vXi16 shifts plus masking (with cheap
// special cases), and (4) 32-bit-mode i64 amounts hidden behind a bitcast of
// a wider build_vector. Returns SDValue() when no case applies.
21813 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21814 const X86Subtarget &Subtarget) {
21815 MVT VT = Op.getSimpleValueType();
21817 SDValue R = Op.getOperand(0);
21818 SDValue Amt = Op.getOperand(1);
// Map the generic shift opcode to the matching X86 shift-by-immediate node.
21820 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21821 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
// Emulate a 64-bit arithmetic shift right (no native pre-AVX512 form) by
// bitcasting to i32 lanes, shifting sign and value words separately, and
// re-interleaving them with shuffles.
21823 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21824 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21825 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21826 SDValue Ex = DAG.getBitcast(ExVT, R);
21828 if (ShiftAmt >= 32) {
21829 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21831 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21832 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21833 ShiftAmt - 32, DAG);
21834 if (VT == MVT::v2i64)
21835 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21836 if (VT == MVT::v4i64)
21837 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21838 {9, 1, 11, 3, 13, 5, 15, 7});
21840 // SRA upper i32, SHL whole i64 and select lower i32.
21841 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21844 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21845 Lower = DAG.getBitcast(ExVT, Lower);
21846 if (VT == MVT::v2i64)
21847 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21848 if (VT == MVT::v4i64)
21849 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21850 {8, 1, 10, 3, 12, 5, 14, 7});
21852 return DAG.getBitcast(VT, Ex);
21855 // Optimize shl/srl/sra with constant shift amount.
21856 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21857 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21858 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21860 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21861 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21863 // i64 SRA needs to be performed as partial shifts.
21864 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21865 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21866 return ArithmeticShiftRight64(ShiftAmt);
// vXi8: there is no byte-granular shift instruction, so shift as vXi16 and
// mask away the bits that leaked across byte boundaries — after trying the
// cheap special cases below first.
21868 if (VT == MVT::v16i8 ||
21869 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21870 VT == MVT::v64i8) {
21871 unsigned NumElts = VT.getVectorNumElements();
21872 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21874 // Simple i8 add case
21875 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21876 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21878 // ashr(R, 7) === cmp_slt(R, 0)
21879 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21880 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21881 if (VT.is512BitVector()) {
21882 assert(VT == MVT::v64i8 && "Unexpected element type!");
// AVX512: compare to a mask register, then sign-extend the mask back to i8.
21883 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21884 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21886 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21889 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21890 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21893 if (Op.getOpcode() == ISD::SHL) {
21894 // Make a large shift.
21895 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21897 SHL = DAG.getBitcast(VT, SHL);
21898 // Zero out the rightmost bits.
21899 return DAG.getNode(ISD::AND, dl, VT, SHL,
21900 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21902 if (Op.getOpcode() == ISD::SRL) {
21903 // Make a large shift.
21904 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21906 SRL = DAG.getBitcast(VT, SRL);
21907 // Zero out the leftmost bits.
21908 return DAG.getNode(ISD::AND, dl, VT, SRL,
21909 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21911 if (Op.getOpcode() == ISD::SRA) {
21912 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
// Mask = sign bit after the logical shift; xor/sub sign-extends the result.
21913 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21915 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21916 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21917 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21920 llvm_unreachable("Unknown shift opcode.");
21925 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21926 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21927 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21928 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21930 // Peek through any splat that was introduced for i64 shift vectorization.
21931 int SplatIndex = -1;
21932 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21933 if (SVN->isSplat()) {
21934 SplatIndex = SVN->getSplatIndex();
21935 Amt = Amt.getOperand(0);
21936 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21937 "Splat shuffle referencing second operand");
// Only amounts expressed as a bitcast of a narrower build_vector are handled.
21940 if (Amt.getOpcode() != ISD::BITCAST ||
21941 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21944 Amt = Amt.getOperand(0);
// Ratio = build_vector elements per i64 lane; reassemble the 64-bit shift
// amount from those pieces (each contributes 64/Ratio bits).
21945 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21946 VT.getVectorNumElements();
21947 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21948 uint64_t ShiftAmt = 0;
21949 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21950 for (unsigned i = 0; i != Ratio; ++i) {
21951 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21955 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21958 // Check remaining shift amounts (if not a splat).
21959 if (SplatIndex < 0) {
21960 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21961 uint64_t ShAmt = 0;
21962 for (unsigned j = 0; j != Ratio; ++j) {
21963 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21967 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
// All lanes must shift by the same amount for this lowering to apply.
21969 if (ShAmt != ShiftAmt)
21974 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21975 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21977 if (Op.getOpcode() == ISD::SRA)
21978 return ArithmeticShiftRight64(ShiftAmt);
// Lower a vector shift whose per-lane amounts are all the same non-constant
// scalar: recover the common base amount (from a splat build_vector or a
// splat shuffle) and emit the X86 shift-by-scalar-amount node. Also matches,
// in 32-bit mode, a v2i64 amount built from repeated 32-bit pieces. Returns
// SDValue() when the amount is not a recognizable uniform value.
21984 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21985 const X86Subtarget &Subtarget) {
21986 MVT VT = Op.getSimpleValueType();
21988 SDValue R = Op.getOperand(0);
21989 SDValue Amt = Op.getOperand(1);
// X86OpcI: shift by scalar-in-xmm amount; X86OpcV: shift by vector amount.
21991 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21992 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21994 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21995 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21997 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21999 MVT EltVT = VT.getVectorElementType();
22001 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22002 // Check if this build_vector node is doing a splat.
22003 // If so, then set BaseShAmt equal to the splat value.
22004 BaseShAmt = BV->getSplatValue();
// An all-undef splat gives us nothing usable; fall through to other matches.
22005 if (BaseShAmt && BaseShAmt.isUndef())
22006 BaseShAmt = SDValue();
22008 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22009 Amt = Amt.getOperand(0);
// Splat shuffle: chase the splatted element back to its defining node.
22011 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22012 if (SVN && SVN->isSplat()) {
22013 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22014 SDValue InVec = Amt.getOperand(0);
22015 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22016 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22017 "Unexpected shuffle index found!");
22018 BaseShAmt = InVec.getOperand(SplatIdx);
22019 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
// Use the inserted scalar directly if it lands on the splatted lane.
22020 if (ConstantSDNode *C =
22021 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22022 if (C->getZExtValue() == SplatIdx)
22023 BaseShAmt = InVec.getOperand(1);
22028 // Avoid introducing an extract element from a shuffle.
22029 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22030 DAG.getIntPtrConstant(SplatIdx, dl));
22034 if (BaseShAmt.getNode()) {
// Normalize the scalar amount: widen sub-i32 types to i32, and odd types
// between i32 and i64 up to i64, before building the target shift node.
22035 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22036 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22037 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22038 else if (EltVT.bitsLT(MVT::i32))
22039 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22041 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22045 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22046 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22047 Amt.getOpcode() == ISD::BITCAST &&
22048 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22049 Amt = Amt.getOperand(0);
// Ratio = narrower build_vector elements per i64 lane; every lane's group
// of pieces must match the first lane's for the amount to be uniform.
22050 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22051 VT.getVectorNumElements();
22052 std::vector<SDValue> Vals(Ratio);
22053 for (unsigned i = 0; i != Ratio; ++i)
22054 Vals[i] = Amt.getOperand(i);
22055 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22056 for (unsigned j = 0; j != Ratio; ++j)
22057 if (Vals[j] != Amt.getOperand(i + j))
22061 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22062 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22067 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22068 SelectionDAG &DAG) {
22069 MVT VT = Op.getSimpleValueType();
22071 SDValue R = Op.getOperand(0);
22072 SDValue Amt = Op.getOperand(1);
22073 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22075 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22076 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22078 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22081 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22084 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22087 // XOP has 128-bit variable logical/arithmetic shifts.
22088 // +ve/-ve Amt = shift left/right.
22089 if (Subtarget.hasXOP() &&
22090 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22091 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22092 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22093 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22094 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22096 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22097 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22098 if (Op.getOpcode() == ISD::SRA)
22099 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22102 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22103 // shifts per-lane and then shuffle the partial results back together.
22104 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22105 // Splat the shift amounts so the scalar shifts above will catch it.
22106 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22107 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22108 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22109 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22110 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22113 // i64 vector arithmetic shift can be emulated with the transform:
22114 // M = lshr(SIGN_MASK, Amt)
22115 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22116 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22117 Op.getOpcode() == ISD::SRA) {
22118 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22119 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22120 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22121 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22122 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22126 // If possible, lower this packed shift into a vector multiply instead of
22127 // expanding it into a sequence of scalar shifts.
22128 // Do this only if the vector shift count is a constant build_vector.
22129 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22130 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22131 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22132 SmallVector<SDValue, 8> Elts;
22133 MVT SVT = VT.getVectorElementType();
22134 unsigned SVTBits = SVT.getSizeInBits();
22135 APInt One(SVTBits, 1);
22136 unsigned NumElems = VT.getVectorNumElements();
22138 for (unsigned i=0; i !=NumElems; ++i) {
22139 SDValue Op = Amt->getOperand(i);
22140 if (Op->isUndef()) {
22141 Elts.push_back(Op);
22145 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22146 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22147 uint64_t ShAmt = C.getZExtValue();
22148 if (ShAmt >= SVTBits) {
22149 Elts.push_back(DAG.getUNDEF(SVT));
22152 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22154 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22155 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22158 // Lower SHL with variable shift amount.
22159 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22160 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22162 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22163 DAG.getConstant(0x3f800000U, dl, VT));
22164 Op = DAG.getBitcast(MVT::v4f32, Op);
22165 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22166 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22169 // If possible, lower this shift as a sequence of two shifts by
22170 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22172 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22174 // Could be rewritten as:
22175 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22177 // The advantage is that the two shifts from the example would be
22178 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22179 // the vector shift into four scalar shifts plus four pairs of vector
22181 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22182 unsigned TargetOpcode = X86ISD::MOVSS;
22183 bool CanBeSimplified;
22184 // The splat value for the first packed shift (the 'X' from the example).
22185 SDValue Amt1 = Amt->getOperand(0);
22186 // The splat value for the second packed shift (the 'Y' from the example).
22187 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22189 // See if it is possible to replace this node with a sequence of
22190 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22191 if (VT == MVT::v4i32) {
22192 // Check if it is legal to use a MOVSS.
22193 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22194 Amt2 == Amt->getOperand(3);
22195 if (!CanBeSimplified) {
22196 // Otherwise, check if we can still simplify this node using a MOVSD.
22197 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22198 Amt->getOperand(2) == Amt->getOperand(3);
22199 TargetOpcode = X86ISD::MOVSD;
22200 Amt2 = Amt->getOperand(2);
22203 // Do similar checks for the case where the machine value type
22205 CanBeSimplified = Amt1 == Amt->getOperand(1);
22206 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22207 CanBeSimplified = Amt2 == Amt->getOperand(i);
22209 if (!CanBeSimplified) {
22210 TargetOpcode = X86ISD::MOVSD;
22211 CanBeSimplified = true;
22212 Amt2 = Amt->getOperand(4);
22213 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22214 CanBeSimplified = Amt1 == Amt->getOperand(i);
22215 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22216 CanBeSimplified = Amt2 == Amt->getOperand(j);
22220 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22221 isa<ConstantSDNode>(Amt2)) {
22222 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22223 MVT CastVT = MVT::v4i32;
22225 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22226 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22228 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22229 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22230 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22231 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22232 if (TargetOpcode == X86ISD::MOVSD)
22233 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22234 BitCast2, {0, 1, 6, 7}));
22235 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22236 BitCast2, {0, 5, 6, 7}));
22240 // v4i32 Non Uniform Shifts.
22241 // If the shift amount is constant we can shift each lane using the SSE2
22242 // immediate shifts, else we need to zero-extend each lane to the lower i64
22243 // and shift using the SSE2 variable shifts.
22244 // The separate results can then be blended together.
22245 if (VT == MVT::v4i32) {
22246 unsigned Opc = Op.getOpcode();
22247 SDValue Amt0, Amt1, Amt2, Amt3;
22249 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22250 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22251 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22252 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22254 // ISD::SHL is handled above but we include it here for completeness.
22257 llvm_unreachable("Unknown target vector shift node");
22259 Opc = X86ISD::VSHL;
22262 Opc = X86ISD::VSRL;
22265 Opc = X86ISD::VSRA;
22268 // The SSE2 shifts use the lower i64 as the same shift amount for
22269 // all lanes and the upper i64 is ignored. These shuffle masks
22270 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
22271 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22272 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22273 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22274 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22275 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22278 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22279 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22280 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22281 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22282 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22283 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22284 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22287 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22288 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22289 // make the existing SSE solution better.
22290 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22291 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22292 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22293 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22294 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22295 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22297 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22298 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22299 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22300 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22301 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22304 if (VT == MVT::v16i8 ||
22305 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22306 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22307 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22308 unsigned ShiftOpcode = Op->getOpcode();
22310 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22311 if (VT.is512BitVector()) {
22312 // On AVX512BW targets we make use of the fact that VSELECT lowers
22313 // to a masked blend which selects bytes based just on the sign bit
22314 // extracted to a mask.
22315 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22316 V0 = DAG.getBitcast(VT, V0);
22317 V1 = DAG.getBitcast(VT, V1);
22318 Sel = DAG.getBitcast(VT, Sel);
22319 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22320 return DAG.getBitcast(SelVT,
22321 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22322 } else if (Subtarget.hasSSE41()) {
22323 // On SSE41 targets we make use of the fact that VSELECT lowers
22324 // to PBLENDVB which selects bytes based just on the sign bit.
22325 V0 = DAG.getBitcast(VT, V0);
22326 V1 = DAG.getBitcast(VT, V1);
22327 Sel = DAG.getBitcast(VT, Sel);
22328 return DAG.getBitcast(SelVT,
22329 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22331 // On pre-SSE41 targets we test for the sign bit by comparing to
22332 // zero - a negative value will set all bits of the lanes to true
22333 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22334 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22335 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22336 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
22339 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22340 // We can safely do this using i16 shifts as we're only interested in
22341 // the 3 lower bits of each byte.
22342 Amt = DAG.getBitcast(ExtVT, Amt);
22343 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22344 Amt = DAG.getBitcast(VT, Amt);
22346 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22347 // r = VSELECT(r, shift(r, 4), a);
22349 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22350 R = SignBitSelect(VT, Amt, M, R);
22353 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22355 // r = VSELECT(r, shift(r, 2), a);
22356 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22357 R = SignBitSelect(VT, Amt, M, R);
22360 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22362 // return VSELECT(r, shift(r, 1), a);
22363 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22364 R = SignBitSelect(VT, Amt, M, R);
22368 if (Op->getOpcode() == ISD::SRA) {
22369 // For SRA we need to unpack each byte to the higher byte of a i16 vector
22370 // so we can correctly sign extend. We don't care what happens to the
22372 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22373 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22374 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22375 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22376 ALo = DAG.getBitcast(ExtVT, ALo);
22377 AHi = DAG.getBitcast(ExtVT, AHi);
22378 RLo = DAG.getBitcast(ExtVT, RLo);
22379 RHi = DAG.getBitcast(ExtVT, RHi);
22381 // r = VSELECT(r, shift(r, 4), a);
22382 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22383 DAG.getConstant(4, dl, ExtVT));
22384 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22385 DAG.getConstant(4, dl, ExtVT));
22386 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22387 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22390 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22391 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22393 // r = VSELECT(r, shift(r, 2), a);
22394 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22395 DAG.getConstant(2, dl, ExtVT));
22396 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22397 DAG.getConstant(2, dl, ExtVT));
22398 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22399 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22402 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22403 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22405 // r = VSELECT(r, shift(r, 1), a);
22406 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22407 DAG.getConstant(1, dl, ExtVT));
22408 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22409 DAG.getConstant(1, dl, ExtVT));
22410 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22411 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22413 // Logical shift the result back to the lower byte, leaving a zero upper
22415 // meaning that we can safely pack with PACKUSWB.
22417 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22419 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22420 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22424 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22425 MVT ExtVT = MVT::v8i32;
22426 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22427 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22428 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22429 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22430 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22431 ALo = DAG.getBitcast(ExtVT, ALo);
22432 AHi = DAG.getBitcast(ExtVT, AHi);
22433 RLo = DAG.getBitcast(ExtVT, RLo);
22434 RHi = DAG.getBitcast(ExtVT, RHi);
22435 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22436 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22437 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22438 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22439 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22442 if (VT == MVT::v8i16) {
22443 unsigned ShiftOpcode = Op->getOpcode();
22445 // If we have a constant shift amount, the non-SSE41 path is best as
22446 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22447 bool UseSSE41 = Subtarget.hasSSE41() &&
22448 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22450 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22451 // On SSE41 targets we make use of the fact that VSELECT lowers
22452 // to PBLENDVB which selects bytes based just on the sign bit.
22454 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22455 V0 = DAG.getBitcast(ExtVT, V0);
22456 V1 = DAG.getBitcast(ExtVT, V1);
22457 Sel = DAG.getBitcast(ExtVT, Sel);
22458 return DAG.getBitcast(
22459 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
22461 // On pre-SSE41 targets we splat the sign bit - a negative value will
22462 // set all bits of the lanes to true and VSELECT uses that in
22463 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22465 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22466 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
22469 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22471 // On SSE41 targets we need to replicate the shift mask in both
22472 // bytes for PBLENDVB.
22475 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22476 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22478 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22481 // r = VSELECT(r, shift(r, 8), a);
22482 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22483 R = SignBitSelect(Amt, M, R);
22486 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22488 // r = VSELECT(r, shift(r, 4), a);
22489 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22490 R = SignBitSelect(Amt, M, R);
22493 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22495 // r = VSELECT(r, shift(r, 2), a);
22496 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22497 R = SignBitSelect(Amt, M, R);
22500 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22502 // return VSELECT(r, shift(r, 1), a);
22503 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22504 R = SignBitSelect(Amt, M, R);
22508 // Decompose 256-bit shifts into smaller 128-bit shifts.
22509 if (VT.is256BitVector())
22510 return Lower256IntArith(Op, DAG);
/// Lower a vector rotate-left (ISD::ROTL) node using XOP's VPROT family.
/// 256-bit types are split in half first; a splatted constant amount uses the
/// immediate form (VPROTI), otherwise the per-element variable form (VPROT).
/// NOTE(review): several physical lines look elided from this span (e.g. the
/// SDLoc declaration that should define `DL`, and the closing braces after
/// the constant-splat case) -- code below is kept byte-for-byte.
22515 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22516                            SelectionDAG &DAG) {
22517 MVT VT = Op.getSimpleValueType();
22519 SDValue R = Op.getOperand(0);
22520 SDValue Amt = Op.getOperand(1);
// Only XOP-capable subtargets with vector ROTL reach this custom lowering.
22522 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22523 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22524 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22526 // XOP has 128-bit vector variable + immediate rotates.
22527 // +ve/-ve Amt = rotate left/right.
22529 // Split 256-bit integers.
22530 if (VT.is256BitVector())
22531 return Lower256IntArith(Op, DAG);
22533 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22535 // Attempt to rotate by immediate.
// A uniform (splat) constant amount can use the immediate form VPROTI.
22536 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22537 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22538 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22539 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22540 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22541 DAG.getConstant(RotateAmt, DL, MVT::i8));
22545 // Use general rotate by variable (per-element).
22546 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
/// Lower the ISD::[SU]ADDO / [SU]SUBO / [SU]MULO family ("arithmetic with
/// overflow") to an EFLAGS-producing X86ISD arithmetic node plus a SETCC on
/// the matching condition (COND_O for signed overflow, COND_B for carry).
/// NOTE(review): this span appears to have lines elided -- the `case ISD::...`
/// labels of the switch, the `break;` statements, several closing braces, and
/// the `SDLoc DL` declaration are missing. The remaining code is kept
/// byte-for-byte; confirm against the upstream file before building.
22549 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22550 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22551 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22552 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22553 // has only one use.
22554 SDNode *N = Op.getNode();
22555 SDValue LHS = N->getOperand(0);
22556 SDValue RHS = N->getOperand(1);
22557 unsigned BaseOp = 0;
22558 X86::CondCode Cond;
22560 switch (Op.getOpcode()) {
22561 default: llvm_unreachable("Unknown ovf instruction!");
// (presumably the SADDO case -- label elided)
22563 // A subtract of one will be selected as a INC. Note that INC doesn't
22564 // set CF, so we can't do this for UADDO.
22565 if (isOneConstant(RHS)) {
22566 BaseOp = X86ISD::INC;
22567 Cond = X86::COND_O;
22570 BaseOp = X86ISD::ADD;
22571 Cond = X86::COND_O;
// (presumably the UADDO case -- label elided; carry flag is checked)
22574 BaseOp = X86ISD::ADD;
22575 Cond = X86::COND_B;
// (presumably the SSUBO case -- label elided)
22578 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22579 // set CF, so we can't do this for USUBO.
22580 if (isOneConstant(RHS)) {
22581 BaseOp = X86ISD::DEC;
22582 Cond = X86::COND_O;
22585 BaseOp = X86ISD::SUB;
22586 Cond = X86::COND_O;
// (presumably the USUBO case -- label elided)
22589 BaseOp = X86ISD::SUB;
22590 Cond = X86::COND_B;
// (presumably the SMULO case -- label elided; i8 uses the dedicated SMUL8)
22593 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22594 Cond = X86::COND_O;
22596 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22597 if (N->getValueType(0) == MVT::i8) {
22598 BaseOp = X86ISD::UMUL8;
22599 Cond = X86::COND_O;
// UMUL yields two result values plus EFLAGS (result value index 2 below).
22602 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22604 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22606 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
// Callers may expect an i1 overflow bit; truncate the SETCC result if so.
22608 if (N->getValueType(1) == MVT::i1)
22609 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22611 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
// Common tail: emit the chosen BaseOp (which also defines EFLAGS) and a
// SETCC reading the selected condition from result value 1.
22615 // Also sets EFLAGS.
22616 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22617 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22619 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22621 if (N->getValueType(1) == MVT::i1)
22622 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22624 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22627 /// Returns true if the operand type is exactly twice the native width, and
22628 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22629 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22630 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22631 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22632 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22635 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22636 else if (OpWidth == 128)
22637 return Subtarget.hasCmpxchg16b();
22642 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22643 return needsCmpXchgNb(SI->getValueOperand()->getType());
22646 // Note: this turns large loads into lock cmpxchg8b/16b.
22647 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22648 TargetLowering::AtomicExpansionKind
22649 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22650 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22651 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22652 : AtomicExpansionKind::None;
/// Decide how AtomicExpandPass should expand an atomicrmw instruction on x86:
/// None (native instruction is fine) or CmpXChg (emit a cmpxchg loop).
/// NOTE(review): the `switch (Op) {` header line and the trailing closing
/// braces appear elided from this span -- code kept byte-for-byte.
22655 TargetLowering::AtomicExpansionKind
22656 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22657 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22658 Type *MemType = AI->getType();
22660 // If the operand is too big, we must see if cmpxchg8/16b is available
22661 // and default to library calls otherwise.
22662 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22663 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22664 : AtomicExpansionKind::None;
22667 AtomicRMWInst::BinOp Op = AI->getOperation();
22670 llvm_unreachable("Unknown atomic operation");
22671 case AtomicRMWInst::Xchg:
22672 case AtomicRMWInst::Add:
22673 case AtomicRMWInst::Sub:
22674 // It's better to use xadd, xsub or xchg for these in all cases.
22675 return AtomicExpansionKind::None;
22676 case AtomicRMWInst::Or:
22677 case AtomicRMWInst::And:
22678 case AtomicRMWInst::Xor:
22679 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22680 // prefix to a normal instruction for these operations.
22681 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22682 : AtomicExpansionKind::None;
22683 case AtomicRMWInst::Nand:
22684 case AtomicRMWInst::Max:
22685 case AtomicRMWInst::Min:
22686 case AtomicRMWInst::UMax:
22687 case AtomicRMWInst::UMin:
22688 // These always require a non-trivial set of data operations on x86. We must
22689 // use a cmpxchg loop.
22690 return AtomicExpansionKind::CmpXChg;
/// Replace an idempotent atomicrmw (e.g. `fetch_add 0`) with an mfence
/// followed by a plain atomic load, which is cheaper than a locked RMW.
/// NOTE(review): this span is missing several physical lines -- the return
/// type (`LoadInst *`) on the line before the function name, the
/// `return nullptr;` statements after each early-out condition, the
/// `Function *MFence =` declaration line, and the final `return Loaded;` and
/// closing brace. Code kept byte-for-byte; confirm against upstream.
22695 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22696 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22697 Type *MemType = AI->getType();
22698 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22699 // there is no benefit in turning such RMWs into loads, and it is actually
22700 // harmful as it introduces a mfence.
22701 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22704 auto Builder = IRBuilder<>(AI);
22705 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22706 auto SynchScope = AI->getSynchScope();
22707 // We must restrict the ordering to avoid generating loads with Release or
22708 // ReleaseAcquire orderings.
22709 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22710 auto Ptr = AI->getPointerOperand();
22712 // Before the load we need a fence. Here is an example lifted from
22713 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is required (litmus test; thread 1 then thread 2):
22716 // x.store(1, relaxed);
22717 // r1 = y.fetch_add(0, release);
22719 // y.fetch_add(42, acquire);
22720 // r2 = x.load(relaxed);
22721 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22722 // lowered to just a load without a fence. A mfence flushes the store buffer,
22723 // making the optimization clearly correct.
22724 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22725 // otherwise, we might be able to be more aggressive on relaxed idempotent
22726 // rmw. In practice, they do not look useful, so we don't try to be
22727 // especially clever.
22728 if (SynchScope == SingleThread)
22729 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22730 // the IR level, so we must wrap it in an intrinsic.
22733 if (!Subtarget.hasMFence())
22734 // FIXME: it might make sense to use a locked operation here but on a
22735 // different cache-line to prevent cache-line bouncing. In practice it
22736 // is probably a small win, and x86 processors without mfence are rare
22737 // enough that we do not bother.
// Emit the mfence via the SSE2 intrinsic, then the relaxed-down atomic load.
22741 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22742 Builder.CreateCall(MFence, {});
22744 // Finally we can emit the atomic load.
22745 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22746 AI->getType()->getPrimitiveSizeInBits());
22747 Loaded->setAtomic(Order, SynchScope);
22748 AI->replaceAllUsesWith(Loaded);
22749 AI->eraseFromParent();
/// Lower ISD::ATOMIC_FENCE: only a sequentially-consistent cross-thread fence
/// needs a real instruction (MFENCE, or a locked or-to-stack idiom when
/// MFENCE is unavailable); everything else becomes a compiler-only barrier.
/// NOTE(review): the `SDLoc dl(Op);` declaration and the framing of the
/// `Ops[]` initializer (its opening `SDValue Ops[] = {`, the trailing
/// `Zero, Chain` operands and `};`) appear elided from this span -- code
/// kept byte-for-byte.
22753 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22754                                  SelectionDAG &DAG) {
// Operand 1 is the ordering, operand 2 the synchronization scope.
22756 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22757 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22758 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22759 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22761 // The only fence that needs an instruction is a sequentially-consistent
22762 // cross-thread fence.
22763 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22764 FenceScope == CrossThread) {
22765 if (Subtarget.hasMFence())
22766 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
// No MFENCE: use a locked `or` to a stack slot as a full-barrier substitute.
22768 SDValue Chain = Op.getOperand(0);
22769 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22771 DAG.getRegister(X86::ESP, MVT::i32), // Base
22772 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22773 DAG.getRegister(0, MVT::i32), // Index
22774 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22775 DAG.getRegister(0, MVT::i32), // Segment.
22779 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22780 return SDValue(Res, 0);
22783 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22784 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
/// Lower ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS to X86ISD::LCMPXCHG_DAG: the
/// expected value is pinned in the accumulator register (AL/AX/EAX/RAX), the
/// success flag is materialized with SETE from EFLAGS.
/// NOTE(review): this span is missing several physical lines -- the `SDLoc
/// DL`, `unsigned Reg`/`size` declarations, the `case MVT::i64:` label, the
/// middle operands of `Ops[]`, the `SDValue cpOut =` declaration line, and
/// the function's final `return`/closing brace. Code kept byte-for-byte.
22787 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22788                              SelectionDAG &DAG) {
22789 MVT T = Op.getSimpleValueType();
// Pick the accumulator sub-register and operand size for the value type.
22793 switch(T.SimpleTy) {
22794 default: llvm_unreachable("Invalid value type!");
22795 case MVT::i8: Reg = X86::AL; size = 1; break;
22796 case MVT::i16: Reg = X86::AX; size = 2; break;
22797 case MVT::i32: Reg = X86::EAX; size = 4; break;
22799 assert(Subtarget.is64Bit() && "Node not type legal!");
22800 Reg = X86::RAX; size = 8;
// Copy the comparison (expected) value into the accumulator register.
22803 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22804 Op.getOperand(2), SDValue());
22805 SDValue Ops[] = { cpIn.getValue(0),
22808 DAG.getTargetConstant(size, DL, MVT::i8),
22809 cpIn.getValue(1) };
22810 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22811 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22812 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
// Read back the old value from the accumulator and EFLAGS for the success
// bit (ZF set iff the exchange happened).
22816 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22817 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22818 MVT::i32, cpOut.getValue(2));
22819 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22821 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22822 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22823 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
/// Custom-lower BITCASTs involving 64-bit scalar/vector types: widen small
/// vectors (v2i32/v4i16/v8i8) or split i64 on 32-bit targets to reach f64 via
/// v2f64, and pass through the i64<->MMX conversions that are Legal.
/// NOTE(review): this span is missing several physical lines -- the `SDLoc
/// dl` and `unsigned NumElts; MVT SVT;` declarations, the `return SDValue();`
/// after the expansion comment, the `} else {` joining the two extraction
/// paths, and the `return Op;` statements after each Legal-conversion check.
/// Code kept byte-for-byte; confirm against upstream.
22827 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22828                             SelectionDAG &DAG) {
22829 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22830 MVT DstVT = Op.getSimpleValueType();
// SSE2 path: rebuild the 64 source bits as a widened i32/i16/i8 vector,
// bitcast it to v2f64 and extract element 0 as the f64 result.
22832 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22833 SrcVT == MVT::i64) {
22834 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22835 if (DstVT != MVT::f64)
22836 // This conversion needs to be expanded.
22839 SDValue Op0 = Op->getOperand(0);
22840 SmallVector<SDValue, 16> Elts;
22844 if (SrcVT.isVector()) {
22845 NumElts = SrcVT.getVectorNumElements();
22846 SVT = SrcVT.getVectorElementType();
22848 // Widen the vector in input in the case of MVT::v2i32.
22849 // Example: from MVT::v2i32 to MVT::v4i32.
22850 for (unsigned i = 0, e = NumElts; i != e; ++i)
22851 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22852 DAG.getIntPtrConstant(i, dl)));
// Scalar i64 on a 32-bit target: split into its two i32 halves.
22854 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22855 "Unexpected source type in LowerBITCAST");
22856 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22857 DAG.getIntPtrConstant(0, dl)));
22858 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22859 DAG.getIntPtrConstant(1, dl)));
22863 // Explicitly mark the extra elements as Undef.
22864 Elts.append(NumElts, DAG.getUNDEF(SVT));
22866 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22867 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22868 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22869 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22870 DAG.getIntPtrConstant(0, dl));
// Non-SSE2 path only exists for 64-bit MMX targets.
22873 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22874 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22875 assert((DstVT == MVT::i64 ||
22876 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22877 "Unexpected custom BITCAST");
22878 // i64 <=> MMX conversions are Legal.
22879 if (SrcVT==MVT::i64 && DstVT.isVector())
22881 if (DstVT==MVT::i64 && SrcVT.isVector())
22883 // MMX <=> MMX conversions are Legal.
22884 if (SrcVT.isVector() && DstVT.isVector())
22886 // All other conversions need to be expanded.
22890 /// Compute the horizontal sum of bytes in V for the elements of VT.
22892 /// Requires V to be a byte vector and VT to be an integer vector type with
22893 /// wider elements than V's type. The width of the elements of VT determines
22894 /// how many bytes of V are summed horizontally to produce each element of the
/// result.
/// NOTE(review): the `SDLoc DL(V);` declaration and some blank/brace lines
/// appear elided from this span -- code kept byte-for-byte.
22896 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22897                                       const X86Subtarget &Subtarget,
22898                                       SelectionDAG &DAG) {
22900 MVT ByteVecVT = V.getSimpleValueType();
22901 MVT EltVT = VT.getVectorElementType();
22902 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22903 "Expected value to have byte element type.");
22904 assert(EltVT != MVT::i8 &&
22905 "Horizontal byte sum only makes sense for wider elements!");
22906 unsigned VecSize = VT.getSizeInBits();
22907 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22909 // PSADBW instruction horizontally add all bytes and leave the result in i64
22910 // chunks, thus directly computes the pop count for v2i64 and v4i64.
22911 if (EltVT == MVT::i64) {
22912 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22913 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22914 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22915 return DAG.getBitcast(VT, V);
22918 if (EltVT == MVT::i32) {
22919 // We unpack the low half and high half into i32s interleaved with zeros so
22920 // that we can use PSADBW to horizontally sum them. The most useful part of
22921 // this is that it lines up the results of two PSADBW instructions to be
22922 // two v2i64 vectors which concatenated are the 4 population counts. We can
22923 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22924 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22925 SDValue V32 = DAG.getBitcast(VT, V);
22926 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22927 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22929 // Do the horizontal sums into two v2i64s.
22930 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22931 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22932 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22933 DAG.getBitcast(ByteVecVT, Low), Zeros);
22934 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22935 DAG.getBitcast(ByteVecVT, High), Zeros);
22937 // Merge them together.
22938 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22939 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22940 DAG.getBitcast(ShortVecVT, Low),
22941 DAG.getBitcast(ShortVecVT, High));
22943 return DAG.getBitcast(VT, V);
22946 // The only element type left is i16.
22947 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22949 // To obtain pop count for each i16 element starting from the pop count for
22950 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22951 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22952 // directly supported.
22953 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22954 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22955 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22956 DAG.getBitcast(ByteVecVT, V));
22957 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
/// Lower vector CTPOP via the SSSE3 in-register PSHUFB lookup-table trick:
/// per-byte popcounts are looked up for the low and high nibbles and added,
/// then widened with LowerHorizontalByteSum for element types > i8.
/// NOTE(review): the `return PopCnt;` body of the final `if (EltVT ==
/// MVT::i8)` appears elided from this span, along with blank lines and the
/// closing brace -- code kept byte-for-byte.
22960 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22961                                         const X86Subtarget &Subtarget,
22962                                         SelectionDAG &DAG) {
22963 MVT VT = Op.getSimpleValueType();
22964 MVT EltVT = VT.getVectorElementType();
22965 unsigned VecSize = VT.getSizeInBits();
22967 // Implement a lookup table in register by using an algorithm based on:
22968 // http://wm.ite.pl/articles/sse-popcount.html
22970 // The general idea is that every lower byte nibble in the input vector is an
22971 // index into a in-register pre-computed pop count table. We then split up the
22972 // input vector in two new ones: (1) a vector with only the shifted-right
22973 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22974 // masked out higher ones) for each byte. PSHUFB is used separately with both
22975 // to index the in-register table. Next, both are added and the result is a
22976 // i8 vector where each element contains the pop count for input byte.
22978 // To obtain the pop count for elements != i8, we follow up with the same
22979 // approach and use additional tricks as described below.
22981 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22982 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22983 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22984 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
// Broadcast the 16-entry LUT across every 16-byte lane of the vector.
22986 int NumByteElts = VecSize / 8;
22987 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22988 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22989 SmallVector<SDValue, 64> LUTVec;
22990 for (int i = 0; i < NumByteElts; ++i)
22991 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22992 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22993 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// Split each byte into its high and low nibble.
22996 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22997 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23000 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23002 // The input vector is used as the shuffle mask that index elements into the
23003 // LUT. After counting low and high nibbles, add the vector to obtain the
23004 // final pop count per i8 element.
23005 SDValue HighPopCnt =
23006 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23007 SDValue LowPopCnt =
23008 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23009 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23011 if (EltVT == MVT::i8)
23014 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
/// Lower 128-bit vector CTPOP without SSSE3 using the classic parallel
/// bit-counting ("Hacker's Delight" / Stanford bithacks) add-and-shift
/// sequence, then widen with LowerHorizontalByteSum for elements > i8.
/// NOTE(review): this span is missing several physical lines -- the `};`
/// terminators of the two lambdas, the `SDValue V = Op;` seed, the lhs of
/// `SDValue Srl =`, the `return V;` for the i8 case, and the final `DAG);`
/// closing the last call. Code kept byte-for-byte; confirm against upstream.
23017 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23018                                        const X86Subtarget &Subtarget,
23019                                        SelectionDAG &DAG) {
23020 MVT VT = Op.getSimpleValueType();
23021 assert(VT.is128BitVector() &&
23022 "Only 128-bit vector bitmath lowering supported.");
23024 int VecSize = VT.getSizeInBits();
23025 MVT EltVT = VT.getVectorElementType();
23026 int Len = EltVT.getSizeInBits();
23028 // This is the vectorized version of the "best" algorithm from
23029 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23030 // with a minor tweak to use a series of adds + shifts instead of vector
23031 // multiplications. Implemented for all integer vector types. We only use
23032 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23033 // much faster, even faster than using native popcnt instructions.
// Helper: shift V by a uniform immediate with the given opcode.
23035 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23036 MVT VT = V.getSimpleValueType();
23037 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23038 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
// Helper: AND V with a splatted constant mask.
23040 auto GetMask = [&](SDValue V, APInt Mask) {
23041 MVT VT = V.getSimpleValueType();
23042 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23043 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23046 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23047 // x86, so set the SRL type to have elements at least i16 wide. This is
23048 // correct because all of our SRLs are followed immediately by a mask anyways
23049 // that handles any bits that sneak into the high bits of the byte elements.
23050 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23054 // v = v - ((v >> 1) & 0x55555555...)
23056 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23057 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23058 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23060 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23061 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23062 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23063 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23064 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23066 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23067 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23068 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23069 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23071 // At this point, V contains the byte-wise population count, and we are
23072 // merely doing a horizontal sum if necessary to get the wider element
23074 if (EltVT == MVT::i8)
23077 return LowerHorizontalByteSum(
23078 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23082 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23083 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23084 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23085 SelectionDAG &DAG) {
23086 MVT VT = Op.getSimpleValueType();
23087 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23088 "Unknown CTPOP type to handle");
23089 SDLoc DL(Op.getNode());
23090 SDValue Op0 = Op.getOperand(0);
23092 if (!Subtarget.hasSSSE3()) {
23093 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23094 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23095 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23098 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
23099 unsigned NumElems = VT.getVectorNumElements();
23101 // Extract each 128-bit vector, compute pop count and concat the result.
23102 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
23103 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
23105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23106 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
23107 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
23110 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
23111 unsigned NumElems = VT.getVectorNumElements();
23113 // Extract each 256-bit vector, compute pop count and concat the result.
23114 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
23115 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
23117 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23118 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
23119 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
23122 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
// Entry point for custom CTPOP lowering; scalar CTPOP is never custom-lowered
// here, so this simply asserts the vector case and forwards.
23125 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23126 SelectionDAG &DAG) {
23127 assert(Op.getSimpleValueType().isVector() &&
23128 "We only do custom lowering for vector population count.");
23129 return LowerVectorCTPOP(Op, Subtarget, DAG);
// Lower ISD::BITREVERSE on XOP-capable targets using VPPERM, which can
// bit-reverse each source byte as part of a byte permute. Scalars are bounced
// through a vector; 256-bit vectors are split into two 128-bit halves.
23132 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23133 MVT VT = Op.getSimpleValueType();
23134 SDValue In = Op.getOperand(0);
23137 // For scalars, its still beneficial to transfer to/from the SIMD unit to
23138 // perform the BITREVERSE.
23139 if (!VT.isVector()) {
// Widen the scalar to a full 128-bit vector, bitreverse, extract lane 0.
23140 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23141 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23142 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23144 DAG.getIntPtrConstant(0, DL));
23147 MVT SVT = VT.getVectorElementType();
23148 int NumElts = VT.getVectorNumElements();
23149 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23151 // Decompose 256-bit ops into smaller 128-bit ops.
23152 if (VT.is256BitVector()) {
23153 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
23154 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
23156 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
23157 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23158 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
23159 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
23162 assert(VT.is128BitVector() &&
23163 "Only 128-bit vector bitreverse lowering supported.");
23165 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23166 // perform the BSWAP in the shuffle.
23167 // Its best to shuffle using the second operand as this will implicitly allow
23168 // memory folding for multiple vectors.
23169 SmallVector<SDValue, 16> MaskElts;
// Build the VPPERM selector: for each element, walk its bytes high-to-low
// (performing the per-element byte swap) and OR in the "reverse bits"
// sub-opcode (2 << 5) on each selector byte. The +16 selects from the
// second VPPERM source operand.
23170 for (int i = 0; i != NumElts; ++i) {
23171 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23172 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23173 int PermuteByte = SourceByte | (2 << 5);
23174 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23178 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23179 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23180 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23182 return DAG.getBitcast(VT, Res);
// Lower ISD::BITREVERSE for byte vectors. XOP targets use the VPPERM path;
// otherwise each byte is split into nibbles and each nibble is bit-reversed
// via a PSHUFB table lookup, then the two halves are OR'd back together.
23185 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23186 SelectionDAG &DAG) {
23187 if (Subtarget.hasXOP())
23188 return LowerBITREVERSE_XOP(Op, DAG);
23190 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23192 MVT VT = Op.getSimpleValueType();
23193 SDValue In = Op.getOperand(0);
23196 unsigned NumElts = VT.getVectorNumElements();
23197 assert(VT.getScalarType() == MVT::i8 &&
23198 "Only byte vector BITREVERSE supported");
23200 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23201 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
23202 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
23203 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
23204 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
23205 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
23206 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
23207 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
23210 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23211 // two nibbles and a PSHUFB lookup to find the bitreverse of each
23212 // 0-15 value (moved to the other nibble).
23213 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23214 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23215 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
// LoLUT[n] = bit-reverse of nibble n placed in the HIGH nibble of the result;
// HiLUT[n] = bit-reverse of nibble n placed in the LOW nibble.
23217 const int LoLUT[16] = {
23218 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23219 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23220 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23221 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23222 const int HiLUT[16] = {
23223 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23224 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23225 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23226 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
// Broadcast the 16-entry tables across every 16-byte lane of the vector.
23228 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23229 for (unsigned i = 0; i < NumElts; ++i) {
23230 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23231 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23234 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23235 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
// PSHUFB uses the nibble value as an index into the LUT vector.
23236 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23237 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23238 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
// Map an ATOMIC_LOAD_* node whose result is unused onto the corresponding
// X86ISD LOCK-prefixed RMW memory intrinsic (LADD/LSUB/LOR/LXOR/LAND),
// preserving the original node's memory operand.
23241 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23242 unsigned NewOpc = 0;
23243 switch (N->getOpcode()) {
23244 case ISD::ATOMIC_LOAD_ADD:
23245 NewOpc = X86ISD::LADD;
23247 case ISD::ATOMIC_LOAD_SUB:
23248 NewOpc = X86ISD::LSUB;
23250 case ISD::ATOMIC_LOAD_OR:
23251 NewOpc = X86ISD::LOR;
23253 case ISD::ATOMIC_LOAD_XOR:
23254 NewOpc = X86ISD::LXOR;
23256 case ISD::ATOMIC_LOAD_AND:
23257 NewOpc = X86ISD::LAND;
23260 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23263 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
// Operands are {chain, pointer, value}; MemVT is the original result type.
23264 return DAG.getMemIntrinsicNode(
23265 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23266 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23267 /*MemVT=*/N->getSimpleValueType(0), MMO);
23270 /// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// If the atomic's result is used, only ADD (via LXADD/XADD) is legal here;
/// SUB is canonicalized to ADD of the negated operand so LXADD can match.
/// If the result is unused, the op is rewritten as a LOCK-prefixed RMW and
/// the original node's chain is RAUW'd to the new node's chain.
23271 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23272 const X86Subtarget &Subtarget) {
23273 SDValue Chain = N->getOperand(0);
23274 SDValue LHS = N->getOperand(1);
23275 SDValue RHS = N->getOperand(2);
23276 unsigned Opc = N->getOpcode();
23277 MVT VT = N->getSimpleValueType(0);
23280 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23281 // can only be lowered when the result is unused. They should have already
23282 // been transformed into a cmpxchg loop in AtomicExpand.
23283 if (N->hasAnyUseOfValue(0)) {
23284 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23285 // select LXADD if LOCK_SUB can't be selected.
23286 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23287 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23288 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23289 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23290 RHS, AN->getMemOperand());
23292 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23293 "Used AtomicRMW ops other than Add should have been expanded!");
23297 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23298 // RAUW the chain, but don't worry about the result, as it's unused.
23299 assert(!N->hasAnyUseOfValue(0));
23300 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
// Lower ISD::ATOMIC_STORE. A sequentially-consistent store, or a store of a
// type that is not legal, is converted into an ATOMIC_SWAP whose result is
// discarded (only its chain is returned); other atomic stores match directly.
23304 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23305 SDNode *Node = Op.getNode();
23307 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23309 // Convert seq_cst store -> xchg
23310 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23311 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23312 // (The only way to get a 16-byte store is cmpxchg16b)
23313 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23314 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23315 AtomicOrdering::SequentiallyConsistent ||
23316 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23317 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23318 cast<AtomicSDNode>(Node)->getMemoryVT(),
23319 Node->getOperand(0),
23320 Node->getOperand(1), Node->getOperand(2),
23321 cast<AtomicSDNode>(Node)->getMemOperand());
// Only the chain result of the swap is needed; the loaded value is dead.
23322 return Swap.getValue(1);
23324 // Other atomic stores have a simple pattern.
// Lower carry-producing/consuming arithmetic (ADDC/ADDE/SUBC/SUBE) onto the
// X86 flag-producing nodes (ADD/ADC/SUB/SBB) with an explicit i32 EFLAGS
// result. ADDE/SUBE additionally consume the incoming carry operand.
23328 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
23329 MVT VT = Op.getNode()->getSimpleValueType(0);
23331 // Let legalize expand this if it isn't a legal type yet.
23332 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
// Result list is {value, EFLAGS}.
23335 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23338 bool ExtraOp = false;
23339 switch (Op.getOpcode()) {
23340 default: llvm_unreachable("Invalid code");
23341 case ISD::ADDC: Opc = X86ISD::ADD; break;
23342 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
23343 case ISD::SUBC: Opc = X86ISD::SUB; break;
23344 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
// Without a carry-in, build a two-operand node; with one (ADC/SBB), forward
// the third operand as the incoming carry.
23348 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23350 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23351 Op.getOperand(1), Op.getOperand(2));
// Lower ISD::FSINCOS on 64-bit Darwin by calling the __sincos_stret /
// __sincosf_stret libcalls, which compute sin and cos in one call and return
// both values in XMM registers.
23354 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23355 SelectionDAG &DAG) {
23356 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23358 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23359 // which returns the values as { float, float } (in XMM0) or
23360 // { double, double } (which is returned in XMM0, XMM1).
23362 SDValue Arg = Op.getOperand(0);
23363 EVT ArgVT = Arg.getValueType();
23364 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23366 TargetLowering::ArgListTy Args;
23367 TargetLowering::ArgListEntry Entry;
23371 Entry.IsSExt = false;
23372 Entry.IsZExt = false;
23373 Args.push_back(Entry);
23375 bool isF64 = ArgVT == MVT::f64;
23376 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23377 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23378 // the results are returned via SRet in memory.
23379 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23382 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
// f64 returns {double, double}; f32 is modeled as a <4 x float> so both
// results land in XMM0.
23384 Type *RetTy = isF64
23385 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
23386 : (Type*)VectorType::get(ArgTy, 4);
23388 TargetLowering::CallLoweringInfo CLI(DAG);
23389 CLI.setDebugLoc(dl)
23390 .setChain(DAG.getEntryNode())
23391 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23393 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23396 // Returned in xmm0 and xmm1.
23397 return CallResult.first;
23399 // Returned in bits 0:31 and 32:64 xmm0.
// f32 case: extract sin (lane 0) and cos (lane 1) from the vector result
// and merge them into the node's two scalar results.
23400 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23401 CallResult.first, DAG.getIntPtrConstant(0, dl));
23402 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23403 CallResult.first, DAG.getIntPtrConstant(1, dl));
23404 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23405 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23408 /// Widen a vector input to a vector of NVT. The
23409 /// input vector must have the same element type as NVT.
///
/// The tail elements are filled with zeroes when \p FillWithZeroes is set,
/// otherwise with undef. Build-vectors of constants are widened element-wise;
/// anything else is widened via INSERT_SUBVECTOR into a fill vector.
23410 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23411 bool FillWithZeroes = false) {
23412 // Check if InOp already has the right width.
23413 MVT InVT = InOp.getSimpleValueType();
23417 if (InOp.isUndef())
23418 return DAG.getUNDEF(NVT);
23420 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23421 "input and widen element type must match");
23423 unsigned InNumElts = InVT.getVectorNumElements();
23424 unsigned WidenNumElts = NVT.getVectorNumElements();
23425 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23426 "Unexpected request for vector widening");
23428 EVT EltVT = NVT.getVectorElementType();
// Peel off an existing CONCAT_VECTORS padding operand when it matches the
// requested fill (all-zeros padding with FillWithZeroes), so we re-pad the
// narrower original value instead of the already-padded one.
23431 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23432 InOp.getNumOperands() == 2) {
23433 SDValue N1 = InOp.getOperand(1);
23434 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23436 InOp = InOp.getOperand(0);
23437 InVT = InOp.getSimpleValueType();
23438 InNumElts = InVT.getVectorNumElements();
// Constant build-vectors: widen by appending fill elements directly.
23441 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23442 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23443 SmallVector<SDValue, 16> Ops;
23444 for (unsigned i = 0; i < InNumElts; ++i)
23445 Ops.push_back(InOp.getOperand(i));
23447 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23448 DAG.getUNDEF(EltVT);
23449 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23450 Ops.push_back(FillVal);
23451 return DAG.getBuildVector(NVT, dl, Ops);
// General case: insert the narrow vector at index 0 of a fill vector.
23453 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23455 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23456 InOp, DAG.getIntPtrConstant(0, dl));
// Lower ISD::MSCATTER for AVX-512. Widens sub-512-bit data/index/mask
// operands to the minimal legal scatter width on targets without VLX, and
// rebuilds the node so that the (killed) mask is also returned as a value.
23459 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23460 SelectionDAG &DAG) {
23461 assert(Subtarget.hasAVX512() &&
23462 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23464 // X86 scatter kills mask register, so its type should be added to
23465 // the list of return values.
23466 // If the "scatter" has 2 return values, it is already handled.
23467 if (Op.getNode()->getNumValues() == 2)
23470 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23471 SDValue Src = N->getValue();
23472 MVT VT = Src.getSimpleValueType();
23473 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23476 SDValue NewScatter;
23477 SDValue Index = N->getIndex();
23478 SDValue Mask = N->getMask();
23479 SDValue Chain = N->getChain();
23480 SDValue BasePtr = N->getBasePtr();
23481 MVT MemVT = N->getMemoryVT().getSimpleVT();
23482 MVT IndexVT = Index.getSimpleValueType();
23483 MVT MaskVT = Mask.getSimpleValueType();
23485 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23486 // The v2i32 value was promoted to v2i64.
23487 // Now we "redo" the type legalizer's work and widen the original
23488 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23490 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23491 "Unexpected memory type");
// Keep the low i32 of each promoted i64 element (indices 0 and 2).
23492 int ShuffleMask[] = {0, 2, -1, -1};
23493 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23494 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23495 // Now we have 4 elements instead of 2.
23496 // Expand the index.
23497 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23498 Index = ExtendToType(Index, NewIndexVT, DAG);
23500 // Expand the mask with zeroes
23501 // Mask may be <2 x i64> or <2 x i1> at this moment
23502 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23503 "Unexpected mask type");
23504 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23505 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23509 unsigned NumElts = VT.getVectorNumElements();
23510 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23511 !Index.getSimpleValueType().is512BitVector()) {
23512 // AVX512F supports only 512-bit vectors. Or data or index should
23513 // be 512 bit wide. If now the both index and data are 256-bit, but
23514 // the vector contains 8 elements, we just sign-extend the index
23515 if (IndexVT == MVT::v8i32)
23516 // Just extend index
23517 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23519 // The minimal number of elts in scatter is 8
// Widen index/mask/data from the original (unmodified) operands so a
// previously-applied promotion is not applied twice.
23522 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23523 // Use original index here, do not modify the index twice
23524 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23525 if (IndexVT.getScalarType() == MVT::i32)
23526 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23529 // At this point we have promoted mask operand
23530 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23531 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23532 // Use the original mask here, do not modify the mask twice
23533 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23535 // The value that should be stored
23536 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23537 Src = ExtendToType(Src, NewVT, DAG);
23540 // If the mask is "wide" at this point - truncate it to i1 vector
23541 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23542 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23544 // The mask is killed by scatter, add it to the values
23545 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23546 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23547 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23548 N->getMemOperand());
// Callers consume only the chain (value #1) of the rebuilt scatter.
23549 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23550 return SDValue(NewScatter.getNode(), 1);
// Lower ISD::MLOAD (masked/expanding load). On AVX-512 without VLX, a
// sub-512-bit load is widened to 512 bits (data, pass-through and mask),
// performed, and the original-width result extracted back out.
23553 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23554 SelectionDAG &DAG) {
23556 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23557 MVT VT = Op.getSimpleValueType();
23558 MVT ScalarVT = VT.getScalarType();
23559 SDValue Mask = N->getMask();
23562 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23563 "Expanding masked load is supported on AVX-512 target only!");
23565 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23566 "Expanding masked load is supported for 32 and 64-bit types only!");
23568 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23569 // VLX. These types for exp-loads are handled here.
23570 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23573 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23574 "Cannot lower masked load op.");
23576 assert((ScalarVT.getSizeInBits() >= 32 ||
23577 (Subtarget.hasBWI() &&
23578 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23579 "Unsupported masked load op.");
23581 // This operation is legal for targets with VLX, but without
23582 // VLX the vector should be widened to 512 bit
23583 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23584 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23585 SDValue Src0 = N->getSrc0();
// Widen the pass-through operand to the wide data type (undef fill).
23586 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23588 // Mask element has to be i1.
23589 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23590 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23591 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23593 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
// Zero-fill the widened mask so the extra lanes are inactive, then
// truncate to an i1 mask if the mask elements are wider than i1.
23595 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23596 if (MaskEltTy != MVT::i1)
23597 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23598 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23599 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23600 N->getBasePtr(), Mask, Src0,
23601 N->getMemoryVT(), N->getMemOperand(),
23602 N->getExtensionType(),
23603 N->isExpandingLoad());
// Extract the original-width result and merge it with the load's chain.
23605 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23606 NewLoad.getValue(0),
23607 DAG.getIntPtrConstant(0, dl));
23608 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
23609 return DAG.getMergeValues(RetOps, dl);
// Lower ISD::MSTORE (masked/compressing store). Mirror of LowerMLOAD: on
// AVX-512 without VLX, widen data and mask to 512 bits (mask zero-filled so
// extra lanes store nothing) and emit the wide masked store.
23612 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23613 SelectionDAG &DAG) {
23614 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23615 SDValue DataToStore = N->getValue();
23616 MVT VT = DataToStore.getSimpleValueType();
23617 MVT ScalarVT = VT.getScalarType();
23618 SDValue Mask = N->getMask();
23621 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23622 "Expanding masked load is supported on AVX-512 target only!");
23624 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23625 "Expanding masked load is supported for 32 and 64-bit types only!");
23627 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
23628 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23631 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23632 "Cannot lower masked store op.");
23634 assert((ScalarVT.getSizeInBits() >= 32 ||
23635 (Subtarget.hasBWI() &&
23636 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23637 "Unsupported masked store op.");
23639 // This operation is legal for targets with VLX, but without
23640 // VLX the vector should be widened to 512 bit
23641 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23642 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23644 // Mask element has to be i1.
23645 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23646 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23647 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23649 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
// Widen the data (undef fill) and mask (zero fill — extra lanes inactive).
23651 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23652 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23653 if (MaskEltTy != MVT::i1)
23654 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23655 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23656 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23657 Mask, N->getMemoryVT(), N->getMemOperand(),
23658 N->isTruncatingStore(), N->isCompressingStore());
// Lower ISD::MGATHER for AVX-512. On targets without VLX where neither the
// data nor the index is 512-bit: an 8-element gather just sign-extends its
// index; otherwise data, index, mask and pass-through are widened to the
// minimal legal gather width and the original-width result extracted.
23661 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23662 SelectionDAG &DAG) {
23663 assert(Subtarget.hasAVX512() &&
23664 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23666 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23668 MVT VT = Op.getSimpleValueType();
23669 SDValue Index = N->getIndex();
23670 SDValue Mask = N->getMask();
23671 SDValue Src0 = N->getValue();
23672 MVT IndexVT = Index.getSimpleValueType();
23673 MVT MaskVT = Mask.getSimpleValueType();
23675 unsigned NumElts = VT.getVectorNumElements();
23676 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23678 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23679 !Index.getSimpleValueType().is512BitVector()) {
23680 // AVX512F supports only 512-bit vectors. Or data or index should
23681 // be 512 bit wide. If now the both index and data are 256-bit, but
23682 // the vector contains 8 elements, we just sign-extend the index
23683 if (NumElts == 8) {
23684 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23685 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23686 N->getOperand(3), Index };
// Update the existing node in place with the extended index.
23687 DAG.UpdateNodeOperands(N, Ops);
23691 // Minimal number of elements in Gather
// Widen the index (and sign-extend i32 indices to i64 lanes).
23694 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23695 Index = ExtendToType(Index, NewIndexVT, DAG);
23696 if (IndexVT.getScalarType() == MVT::i32)
23697 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23700 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23701 // At this point we have promoted mask operand
23702 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23703 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Zero-fill the widened mask so extra lanes are inactive, then narrow to i1.
23704 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23705 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23707 // The pass-through value
23708 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23709 Src0 = ExtendToType(Src0, NewVT, DAG);
23711 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23712 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23713 N->getMemoryVT(), dl, Ops,
23714 N->getMemOperand());
// Extract the original-width result and merge it with the gather's chain.
23715 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23716 NewGather.getValue(0),
23717 DAG.getIntPtrConstant(0, dl));
23718 SDValue RetOps[] = {Exract, NewGather.getValue(1)};
23719 return DAG.getMergeValues(RetOps, dl);
// Lower GC_TRANSITION_START as a literal NOOP machine node, forwarding the
// chain (and any glue operand) so scheduling constraints are preserved.
23724 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23725 SelectionDAG &DAG) const {
23726 // TODO: Eventually, the lowering of these nodes should be informed by or
23727 // deferred to the GC strategy for the function in which they appear. For
23728 // now, however, they must be lowered to something. Since they are logically
23729 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23730 // require special handling for these nodes), lower them as literal NOOPs for
23732 SmallVector<SDValue, 2> Ops;
23734 Ops.push_back(Op.getOperand(0));
// Propagate an incoming glue operand, if present (always the last operand).
23735 if (Op->getGluedNode())
23736 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23739 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23740 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
// Lower GC_TRANSITION_END identically to GC_TRANSITION_START: a literal NOOP
// machine node carrying the chain (and any glue operand).
23745 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23746 SelectionDAG &DAG) const {
23747 // TODO: Eventually, the lowering of these nodes should be informed by or
23748 // deferred to the GC strategy for the function in which they appear. For
23749 // now, however, they must be lowered to something. Since they are logically
23750 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23751 // require special handling for these nodes), lower them as literal NOOPs for
23753 SmallVector<SDValue, 2> Ops;
23755 Ops.push_back(Op.getOperand(0));
// Propagate an incoming glue operand, if present (always the last operand).
23756 if (Op->getGluedNode())
23757 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23760 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23761 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23766 /// Provide custom lowering hooks for some operations.
///
/// Central dispatch for every ISD opcode this target marked as Custom:
/// routes the node to the matching Lower* helper. The default case is
/// unreachable — an opcode must not be marked Custom without a handler here.
23767 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23768 switch (Op.getOpcode()) {
23769 default: llvm_unreachable("Should not custom lower this!");
23770 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23771 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23772 return LowerCMP_SWAP(Op, Subtarget, DAG);
23773 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23774 case ISD::ATOMIC_LOAD_ADD:
23775 case ISD::ATOMIC_LOAD_SUB:
23776 case ISD::ATOMIC_LOAD_OR:
23777 case ISD::ATOMIC_LOAD_XOR:
23778 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23779 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23780 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23781 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23782 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23783 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23784 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23785 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23786 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23787 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23788 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23789 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23790 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23791 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23792 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23793 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23794 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23795 case ISD::SHL_PARTS:
23796 case ISD::SRA_PARTS:
23797 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23798 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23799 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23800 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23801 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23802 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23803 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23804 case ISD::ZERO_EXTEND_VECTOR_INREG:
23805 case ISD::SIGN_EXTEND_VECTOR_INREG:
23806 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23807 case ISD::FP_TO_SINT:
23808 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23809 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23810 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23812 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23813 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23814 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23815 case ISD::SETCC: return LowerSETCC(Op, DAG);
23816 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23817 case ISD::SELECT: return LowerSELECT(Op, DAG);
23818 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23819 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23820 case ISD::VASTART: return LowerVASTART(Op, DAG);
23821 case ISD::VAARG: return LowerVAARG(Op, DAG);
23822 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23823 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23824 case ISD::INTRINSIC_VOID:
23825 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23826 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23827 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23828 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23829 case ISD::FRAME_TO_ARGS_OFFSET:
23830 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23831 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23832 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23833 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23834 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23835 case ISD::EH_SJLJ_SETUP_DISPATCH:
23836 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23837 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23838 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23839 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23841 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23843 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23844 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23846 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23847 case ISD::UMUL_LOHI:
23848 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23849 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23852 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23858 case ISD::UMULO: return LowerXALUO(Op, DAG);
23859 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23860 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23864 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23866 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23870 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23871 case ISD::ABS: return LowerABS(Op, DAG);
23872 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23873 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23874 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23875 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23876 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23877 case ISD::GC_TRANSITION_START:
23878 return LowerGC_TRANSITION_START(Op, DAG);
23879 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23880 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23884 /// Places new result values for the node in Results (their number
23885 /// and types must exactly match those of the original return values of
23886 /// the node), or leaves Results empty, which indicates that the node is not
23887 /// to be custom lowered after all.
///
/// Thin adapter over LowerOperation for the type legalizer: it copies only
/// the first N->getNumValues() results, deliberately dropping any surplus
/// trailing value (e.g. a chain) the lowering may have produced.
23888 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23889 SmallVectorImpl<SDValue> &Results,
23890 SelectionDAG &DAG) const {
23891 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
// Null result means "not custom lowered"; leave Results empty.
23893 if (!Res.getNode())
23896 assert((N->getNumValues() <= Res->getNumValues()) &&
23897 "Lowering returned the wrong number of results!");
23899 // Places new result values base on N result number.
23900 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23901 // than original node, chain should be dropped(last value).
23902 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23903 Results.push_back(Res.getValue(I));
/// Replace a node with an illegal result type with a new node built out of
/// legal-typed operations; the replacement values are appended to \p Results
/// in the same order as N's original result values.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    // Widen to the smallest legal register size that holds the input.
    const unsigned RegSize =
        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    assert((Subtarget.hasBWI() || RegSize < 512) &&
           "512-bit vector requires AVX512BW");
    assert((Subtarget.hasAVX2() || RegSize < 256) &&
           "256-bit vector requires AVX2");

    auto ElemVT = InVT.getVectorElementType();
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
                                  RegSize / ElemVT.getSizeInBits());
    assert(RegSize % InVT.getSizeInBits() == 0);
    unsigned NumConcat = RegSize / InVT.getSizeInBits();

    // Pad both operands with undef up to the register width.
    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    // Extract back the originally-requested narrow result.
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
  case ISD::UDIVREM: {
    // 128-bit div/rem is lowered to a Win64 libcall-style helper.
    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
    Results.push_back(V);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    if (N->getValueType(0) == MVT::v2i32) {
      assert((IsSigned || Subtarget.hasAVX512()) &&
             "Can only handle signed conversion without AVX512");
      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
      SDValue Src = N->getOperand(0);
      if (Src.getValueType() == MVT::v2f64) {
        // Convert with the wider v4i32 node, then take the low two lanes.
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
                                           : X86ISD::CVTTP2UI,
                                  dl, MVT::v4i32, Src);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
      if (Src.getValueType() == MVT::v2f32) {
        // Widen the v2f32 source with undef lanes, convert, then narrow.
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                  DAG.getUNDEF(MVT::v2f32));
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);

      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
      // so early out here.

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
      Results.push_back(FIST);
  case ISD::SINT_TO_FP: {
    // Only handled for v2i64 -> v2f32 with AVX512DQ+VL (VCVTQQ2PS).
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = N->getOperand(0);
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
  case ISD::UINT_TO_FP: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
    SDValue Src = N->getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
    if (SrcVT != MVT::v2i32)
    // Classic unsigned->double trick: OR the zero-extended 32-bit values
    // into the mantissa of 2^52 (bit pattern 0x4330000000000000), then
    // subtract the bias to recover the exact value.
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
    // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
  case ISD::FP_EXTEND: {
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
    // No other ValueType for FP_EXTEND should reach this point.
    assert(N->getValueType(0) == MVT::v2f32 &&
           "Do not know how to legalize this Node");
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    default : llvm_unreachable("Do not know how to custom type "
                               "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
    case Intrinsic::x86_xgetbv:
      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
  case ISD::INTRINSIC_WO_CHAIN: {
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
      Results.push_back(V);
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    // Split the expected value into low/high halves and pin them into
    // (R|E)AX:(R|E)DX as CMPXCHG8B/16B require.
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, dl, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    // Likewise split the swap value; its high half goes in (R|E)CX.
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, dl, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, dl, HalfT));
        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
                         swapInH, cpInH.getValue(1));
    // If the current function needs the base pointer, RBX,
    // we shouldn't use cmpxchg directly.
    // Indeed the lowering of that instruction will clobber
    // that register and since RBX will be a reserved register
    // the register allocator will not make sure its value will
    // be properly saved and restored around this live-range.
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned BasePtr = TRI->getBaseRegister();
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
      // ISel prefers the LCMPXCHG64 variant.
      // If that assert breaks, that means it is not the case anymore,
      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
      // not just EBX. This is a matter of accepting i64 input for that
      // pseudo, and restoring into the register of the right wide
      // in expand pseudo. Everything else should just work.
      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
             "Saving only half of the RBX");
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
                                           Regs64bit ? X86::RBX : X86::EBX,
                                           HalfT, swapInH.getValue(1));
      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
                       /*Glue*/ RBXSave.getValue(2)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
        Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
                                 swapInH.getValue(1));
      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
                       swapInL.getValue(1)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    // Reassemble the wide result from the (R|E)AX:(R|E)DX output halves.
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    // Success is ZF set by the cmpxchg, read out of EFLAGS.
    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
  case ISD::BITCAST: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0)->getValueType(0);

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    // Place the f64 into the low half of a v2f64 and bitcast to the
    // double-width integer vector.
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

    if (ExperimentalVectorWideningLegalization) {
      // If we are legalizing vectors by widening, we already have the desired
      // legal vector type, just return it.
      Results.push_back(ToVecInt);

    // Otherwise rebuild the narrow vector element-by-element.
    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24223 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24224 switch ((X86ISD::NodeType)Opcode) {
24225 case X86ISD::FIRST_NUMBER: break;
24226 case X86ISD::BSF: return "X86ISD::BSF";
24227 case X86ISD::BSR: return "X86ISD::BSR";
24228 case X86ISD::SHLD: return "X86ISD::SHLD";
24229 case X86ISD::SHRD: return "X86ISD::SHRD";
24230 case X86ISD::FAND: return "X86ISD::FAND";
24231 case X86ISD::FANDN: return "X86ISD::FANDN";
24232 case X86ISD::FOR: return "X86ISD::FOR";
24233 case X86ISD::FXOR: return "X86ISD::FXOR";
24234 case X86ISD::FILD: return "X86ISD::FILD";
24235 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24236 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24237 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24238 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24239 case X86ISD::FLD: return "X86ISD::FLD";
24240 case X86ISD::FST: return "X86ISD::FST";
24241 case X86ISD::CALL: return "X86ISD::CALL";
24242 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24243 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24244 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24245 case X86ISD::BT: return "X86ISD::BT";
24246 case X86ISD::CMP: return "X86ISD::CMP";
24247 case X86ISD::COMI: return "X86ISD::COMI";
24248 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24249 case X86ISD::CMPM: return "X86ISD::CMPM";
24250 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24251 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24252 case X86ISD::SETCC: return "X86ISD::SETCC";
24253 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24254 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24255 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24256 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24257 case X86ISD::CMOV: return "X86ISD::CMOV";
24258 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24259 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24260 case X86ISD::IRET: return "X86ISD::IRET";
24261 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24262 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24263 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24264 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24265 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24266 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24267 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24268 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24269 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24270 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24271 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24272 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24273 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24274 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24275 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24276 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24277 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24278 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24279 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24280 case X86ISD::HADD: return "X86ISD::HADD";
24281 case X86ISD::HSUB: return "X86ISD::HSUB";
24282 case X86ISD::FHADD: return "X86ISD::FHADD";
24283 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24284 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24285 case X86ISD::FMAX: return "X86ISD::FMAX";
24286 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24287 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24288 case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
24289 case X86ISD::FMIN: return "X86ISD::FMIN";
24290 case X86ISD::FMINS: return "X86ISD::FMINS";
24291 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24292 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24293 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24294 case X86ISD::FMINC: return "X86ISD::FMINC";
24295 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24296 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24297 case X86ISD::FRCP: return "X86ISD::FRCP";
24298 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24299 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24300 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24301 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24302 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24303 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24304 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24305 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24306 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24307 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24308 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24309 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24310 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24311 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24312 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24313 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24314 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24315 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24316 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24317 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24318 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24319 case X86ISD::LADD: return "X86ISD::LADD";
24320 case X86ISD::LSUB: return "X86ISD::LSUB";
24321 case X86ISD::LOR: return "X86ISD::LOR";
24322 case X86ISD::LXOR: return "X86ISD::LXOR";
24323 case X86ISD::LAND: return "X86ISD::LAND";
24324 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24325 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24326 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24327 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24328 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24329 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24330 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24331 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24332 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24333 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24334 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24335 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24336 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24337 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24338 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24339 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24340 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24341 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24342 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24343 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24344 case X86ISD::VSHL: return "X86ISD::VSHL";
24345 case X86ISD::VSRL: return "X86ISD::VSRL";
24346 case X86ISD::VSRA: return "X86ISD::VSRA";
24347 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24348 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24349 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24350 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24351 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24352 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24353 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24354 case X86ISD::CMPP: return "X86ISD::CMPP";
24355 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24356 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24357 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24358 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24359 case X86ISD::ADD: return "X86ISD::ADD";
24360 case X86ISD::SUB: return "X86ISD::SUB";
24361 case X86ISD::ADC: return "X86ISD::ADC";
24362 case X86ISD::SBB: return "X86ISD::SBB";
24363 case X86ISD::SMUL: return "X86ISD::SMUL";
24364 case X86ISD::UMUL: return "X86ISD::UMUL";
24365 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24366 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24367 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24368 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24369 case X86ISD::INC: return "X86ISD::INC";
24370 case X86ISD::DEC: return "X86ISD::DEC";
24371 case X86ISD::OR: return "X86ISD::OR";
24372 case X86ISD::XOR: return "X86ISD::XOR";
24373 case X86ISD::AND: return "X86ISD::AND";
24374 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24375 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24376 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24377 case X86ISD::PTEST: return "X86ISD::PTEST";
24378 case X86ISD::TESTP: return "X86ISD::TESTP";
24379 case X86ISD::TESTM: return "X86ISD::TESTM";
24380 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24381 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24382 case X86ISD::KTEST: return "X86ISD::KTEST";
24383 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24384 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24385 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24386 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24387 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24388 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24389 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24390 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24391 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24392 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24393 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24394 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24395 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24396 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24397 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24398 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24399 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24400 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24401 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24402 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24403 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24404 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24405 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24406 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24407 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24408 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24409 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24410 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24411 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24412 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24413 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24414 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24415 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24416 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24417 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24418 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24419 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24420 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24421 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24422 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24423 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24424 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24425 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24426 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24427 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24428 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24429 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24430 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24431 case X86ISD::SAHF: return "X86ISD::SAHF";
24432 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24433 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24434 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24435 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24436 case X86ISD::VPROT: return "X86ISD::VPROT";
24437 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24438 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24439 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24440 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24441 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24442 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24443 case X86ISD::FMADD: return "X86ISD::FMADD";
24444 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24445 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24446 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24447 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24448 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24449 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24450 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24451 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24452 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24453 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24454 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24455 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24456 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24457 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24458 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24459 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24460 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24461 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24462 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24463 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24464 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24465 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24466 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24467 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24468 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24469 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24470 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24471 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24472 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24473 case X86ISD::XTEST: return "X86ISD::XTEST";
24474 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24475 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24476 case X86ISD::SELECT: return "X86ISD::SELECT";
24477 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24478 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24479 case X86ISD::RCP28: return "X86ISD::RCP28";
24480 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24481 case X86ISD::EXP2: return "X86ISD::EXP2";
24482 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24483 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24484 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24485 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24486 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24487 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24488 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24489 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24490 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24491 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24492 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24493 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24494 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24495 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24496 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24497 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24498 case X86ISD::ADDS: return "X86ISD::ADDS";
24499 case X86ISD::SUBS: return "X86ISD::SUBS";
24500 case X86ISD::AVG: return "X86ISD::AVG";
24501 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24502 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24503 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24504 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24505 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24506 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24507 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24508 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24509 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24510 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24511 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24512 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24513 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24514 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24515 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24516 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24517 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24518 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24519 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24520 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24521 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24522 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24523 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24524 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))

  // Classify how this global must be referenced under the current
  // relocation/PIC settings.
  unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

  // If a reference to this global requires an extra load, we can't fold it.
  if (isGlobalStubReference(GVFlags))

  // If BaseGV requires a register for the PIC base, we cannot also have a
  // BaseReg specified.
  if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))

  // If lower 4G is not available, then we must use rip-relative addressing.
  if ((M != CodeModel::Small || isPositionIndependent()) &&
      Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))

  switch (AM.Scale) {
    // These scales always work.
    // These scales are formed with basereg+scalereg. Only accept if there is
  default:  // Other stuff never works.
/// Return true if shifting a vector by a single (splatted) scalar amount is
/// cheaper than shifting by a general per-element vector of amounts.
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
  // particularly cheaper than those without.

  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
  // variable shifts just as cheap as scalar ones.
  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))

  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
  // fully general vector.
/// Return true if truncating Ty1 to Ty2 is free: on x86 a wider-to-narrower
/// integer truncate needs no instruction (the narrower sub-register is used
/// directly).
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
/// Return true if it is safe to tail-call through a truncate of the callee's
/// integer result from Ty1 to Ty2.
bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())

  if (!isTypeLegal(EVT::getEVT(Ty1)))

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
/// Compare instructions take a sign-extended 32-bit immediate, so any value
/// representable in 32 signed bits is legal.
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
/// Add instructions likewise take a sign-extended 32-bit immediate.
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
/// EVT variant of isTruncateFree: integer wide-to-narrow truncation is free.
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
/// i32 -> i64 zero extension is free on x86-64.
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
/// EVT variant: i32 -> i64 zero extension is free on x86-64.
bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
/// SDValue variant: also treats zero extension of a load result as free,
/// since x86 zero-extending loads cover the small integer types.
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))

  // Only loads get the special free-extension treatment below.
  if (Val.getOpcode() != ISD::LOAD)

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())

  switch (VT1.getSimpleVT().SimpleTy) {
    // X86 has 8, 16, and 32-bit zero-extending loads.
24673 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24676 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24677 if (!Subtarget.hasAnyFMA())
24680 VT = VT.getScalarType();
24682 if (!VT.isSimple())
24685 switch (VT.getSimpleVT().SimpleTy) {
24696 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24697 // i16 instructions are longer (0x66 prefix) and potentially slower.
24698 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24701 /// Targets can use this to indicate that they only support *some*
24702 /// VECTOR_SHUFFLE operations, those with specific masks.
24703 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24704 /// are assumed to be legal.
24706 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24708 if (!VT.isSimple())
24711 // Not for i1 vectors
24712 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24715 // Very little shuffling can be done for 64-bit vectors right now.
24716 if (VT.getSimpleVT().getSizeInBits() == 64)
24719 // We only care that the types being shuffled are legal. The lowering can
24720 // handle any possible shuffle mask that results.
24721 return isTypeLegal(VT.getSimpleVT());
24725 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24727 // Just delegate to the generic legality, clear masks aren't special.
24728 return isShuffleMaskLegal(Mask, VT);
24731 //===----------------------------------------------------------------------===//
24732 // X86 Scheduler Hooks
24733 //===----------------------------------------------------------------------===//
/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI.getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin sinkMBB
  //
  // mainMBB:
  //  eax = -1
  //
  // sinkMBB:
  //  v = eax

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  //  xbegin sinkMBB
  //  # fallthrough to mainMBB
  //  # abortion to sinkMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(sinkMBB);

  // mainMBB:
  //  EAX = -1 (xbegin leaves -1 in EAX on normal entry to the region)
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  // EAX is live into the sinkMBB
  sinkMBB->addLiveIn(X86::EAX);
  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
          MI.getOperand(0).getReg())
      .addReg(X86::EAX);

  MI.eraseFromParent();
  return sinkMBB;
}
24790 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24791 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24792 // in the .td file.
24793 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24794 const TargetInstrInfo *TII) {
24796 switch (MI.getOpcode()) {
24797 default: llvm_unreachable("illegal opcode!");
24798 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24799 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24800 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24801 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24802 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24803 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24804 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24805 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24808 DebugLoc dl = MI.getDebugLoc();
24809 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24811 unsigned NumArgs = MI.getNumOperands();
24812 for (unsigned i = 1; i < NumArgs; ++i) {
24813 MachineOperand &Op = MI.getOperand(i);
24814 if (!(Op.isReg() && Op.isImplicit()))
24817 if (MI.hasOneMemOperand())
24818 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24820 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24821 .addReg(X86::XMM0);
24823 MI.eraseFromParent();
24827 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24828 // defs in an instruction pattern
24829 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24830 const TargetInstrInfo *TII) {
24832 switch (MI.getOpcode()) {
24833 default: llvm_unreachable("illegal opcode!");
24834 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24835 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24836 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24837 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24838 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24839 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24840 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24841 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24844 DebugLoc dl = MI.getDebugLoc();
24845 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24847 unsigned NumArgs = MI.getNumOperands(); // remove the results
24848 for (unsigned i = 1; i < NumArgs; ++i) {
24849 MachineOperand &Op = MI.getOperand(i);
24850 if (!(Op.isReg() && Op.isImplicit()))
24853 if (MI.hasOneMemOperand())
24854 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24856 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24859 MI.eraseFromParent();
24863 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24864 const X86Subtarget &Subtarget) {
24865 DebugLoc dl = MI.getDebugLoc();
24866 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24868 // insert input VAL into EAX
24869 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24870 .addReg(MI.getOperand(0).getReg());
24871 // insert zero to ECX
24872 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24874 // insert zero to EDX
24875 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24877 // insert WRPKRU instruction
24878 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24880 MI.eraseFromParent(); // The pseudo is gone now.
24884 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24885 const X86Subtarget &Subtarget) {
24886 DebugLoc dl = MI.getDebugLoc();
24887 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24889 // insert zero to ECX
24890 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24892 // insert RDPKRU instruction
24893 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24894 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24897 MI.eraseFromParent(); // The pseudo is gone now.
24901 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24902 const X86Subtarget &Subtarget,
24904 DebugLoc dl = MI.getDebugLoc();
24905 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24906 // Address into RAX/EAX, other two args into ECX, EDX.
24907 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24908 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24909 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24910 for (int i = 0; i < X86::AddrNumOperands; ++i)
24911 MIB.add(MI.getOperand(i));
24913 unsigned ValOps = X86::AddrNumOperands;
24914 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24915 .addReg(MI.getOperand(ValOps).getReg());
24916 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24917 .addReg(MI.getOperand(ValOps + 1).getReg());
24919 // The instruction doesn't actually take any operands though.
24920 BuildMI(*BB, MI, dl, TII->get(Opc));
24922 MI.eraseFromParent(); // The pseudo is gone now.
24926 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24927 const X86Subtarget &Subtarget) {
24928 DebugLoc dl = MI->getDebugLoc();
24929 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24930 // Address into RAX/EAX
24931 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24932 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24933 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24934 for (int i = 0; i < X86::AddrNumOperands; ++i)
24935 MIB.add(MI->getOperand(i));
24937 // The instruction doesn't actually take any operands though.
24938 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24940 MI->eraseFromParent(); // The pseudo is gone now.
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  unsigned Align = MI.getOperand(8).getImm();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI.getDebugLoc();

  // SysV AMD64 ABI va_list layout:
  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    // thisMBB
    //   |     .
    //   |        .
    // offsetMBB   overflowMBB
    //   |        .
    //   |     .
    // endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value (gp_offset at +0 or fp_offset at +4) into a
    // register.
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address (at offset 16 in the va_list).
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address (at offset 8 in the va_list) into a
  // register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align - 1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.

  return EndMBB;
}
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
25310 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25311 // together with other CMOV pseudo-opcodes into a single basic-block with
25312 // conditional jump around it.
25313 static bool isCMOVPseudo(MachineInstr &MI) {
25314 switch (MI.getOpcode()) {
25315 case X86::CMOV_FR32:
25316 case X86::CMOV_FR64:
25317 case X86::CMOV_GR8:
25318 case X86::CMOV_GR16:
25319 case X86::CMOV_GR32:
25320 case X86::CMOV_RFP32:
25321 case X86::CMOV_RFP64:
25322 case X86::CMOV_RFP80:
25323 case X86::CMOV_V2F64:
25324 case X86::CMOV_V2I64:
25325 case X86::CMOV_V4F32:
25326 case X86::CMOV_V4F64:
25327 case X86::CMOV_V4I64:
25328 case X86::CMOV_V16F32:
25329 case X86::CMOV_V8F32:
25330 case X86::CMOV_V8F64:
25331 case X86::CMOV_V8I64:
25332 case X86::CMOV_V8I1:
25333 case X86::CMOV_V16I1:
25334 case X86::CMOV_V32I1:
25335 case X86::CMOV_V64I1:
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC sinkMBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();

  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
  // as described above, by inserting a BB, and then making a PHI at the join
  // point to select the true and false operands of the CMOV in the PHI.
  //
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  // In this case, there are multiple CMOVs in a row, all which are based on
  // the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here, is that in a case like:
  //
  // t2 = CMOV cond1 t1, f1
  // t3 = CMOV cond1 t2, f2
  //
  // when rewriting this into PHIs, we have to perform some renaming on the
  // temps since you cannot have a PHI operand refer to a PHI result earlier
  // in the same block.  The "simple" but wrong lowering would be:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t2(BB1), f2(BB2)
  //
  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
  // renaming is to note that on the path through BB1, t2 is really just a
  // copy of t1, and do that renaming, properly generating:
  //
  // t2 = PHI t1(BB1), f1(BB2)
  // t3 = PHI t1(BB1), f2(BB2)
  //
  // Case 2, we lower cascaded CMOVs such as
  //
  //   (CMOV (CMOV F, T, cc1), T, cc2)
  //
  // to two successive branches.  For that, we look for another CMOV as the
  // following instruction.
  //
  // Without this, we would add a PHI between the two jumps, which ends up
  // creating a few copies all around. For instance, for
  //
  //    (sitofp (zext (fcmp une)))
  //
  // we would generate two jumps with an intermediate PHI/copy, whereas
  // lowering both CMOVs in a single step produces a single join-point PHI
  // with three incoming values and no extra copies.
  MachineInstr *CascadedCMOV = nullptr;
  MachineInstr *LastCMOV = &MI;
  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  MachineBasicBlock::iterator NextMIIt =
      std::next(MachineBasicBlock::iterator(MI));

  // Check for case 1, where there are multiple CMOVs with the same condition
  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
  // number of jumps the most.

  if (isCMOVPseudo(MI)) {
    // See if we have a string of CMOVS with the same condition.
    while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
           (NextMIIt->getOperand(3).getImm() == CC ||
            NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }

  // This checks for case 2, but only do this if we didn't already find
  // case 1, as indicated by LastCMOV == MI.
  if (LastCMOV == &MI && NextMIIt != BB->end() &&
      NextMIIt->getOpcode() == MI.getOpcode() &&
      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
      NextMIIt->getOperand(1).isKill()) {
    CascadedCMOV = &*NextMIIt;
  }

  MachineBasicBlock *jcc1MBB = nullptr;

  // If we have a cascaded CMOV, we lower it to two successive branches to
  // the same block.  EFLAGS is used by both, so mark it as live in the second.
  if (CascadedCMOV) {
    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, jcc1MBB);
    jcc1MBB->addLiveIn(X86::EFLAGS);
  }

  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
    copy0MBB->addLiveIn(X86::EFLAGS);
    sinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  if (CascadedCMOV) {
    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
    BB->addSuccessor(jcc1MBB);

    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
    // jump to the sinkMBB.
    jcc1MBB->addSuccessor(copy0MBB);
    jcc1MBB->addSuccessor(sinkMBB);
  } else {
    BB->addSuccessor(copy0MBB);
  }

  // The true block target of the first (or only) branch is always sinkMBB.
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc = X86::GetCondBranchFromCond(CC);
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  if (CascadedCMOV) {
    unsigned Opc2 = X86::GetCondBranchFromCond(
        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
  }

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  MachineInstrBuilder MIB;

  // As we are creating the PHIs, we have to be careful if there is more than
  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHI's
  // destination registers, and the registers that went into the PHI.

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are generating is the opposite condition from
    // the jump we generated, then we have to swap the operands for the
    // PHI that is going to be generated.
    if (MIIt->getOperand(3).getImm() == OppCC)
      std::swap(Op1Reg, Op2Reg);

    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
      Op1Reg = RegRewriteTable[Op1Reg].first;

    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
      Op2Reg = RegRewriteTable[Op2Reg].second;

    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
                  TII->get(X86::PHI), DestReg)
          .addReg(Op1Reg).addMBB(copy0MBB)
          .addReg(Op2Reg).addMBB(thisMBB);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // If we have a cascaded CMOV, the second Jcc provides the same incoming
  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
  if (CascadedCMOV) {
    MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
    // Copy the PHI result to the register defined by the second CMOV.
    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
            DL, TII->get(TargetOpcode::COPY),
            CascadedCMOV->getOperand(0).getReg())
        .addReg(MI.getOperand(0).getReg());
    CascadedCMOV->eraseFromParent();
  }

  // Now remove the CMOV(s).
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
    (MIIt++)->eraseFromParent();

  return sinkMBB;
}
25620 MachineBasicBlock *
25621 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25622 MachineBasicBlock *BB) const {
25623 // Combine the following atomic floating-point modification pattern:
25624 // a.store(reg OP a.load(acquire), release)
25625 // Transform them into:
25626 // OPss (%gpr), %xmm
25627 // movss %xmm, (%gpr)
25628 // Or sd equivalent for 64-bit operations.
// Pick the arithmetic (FOp) and store (MOp) opcodes that implement the
// RELEASE_FADD pseudo at the matching width (SS = f32, SD = f64).
// NOTE(review): the declarations of FOp/MOp (and the case 'break's) are not
// visible in this excerpt — presumably declared just above the switch.
25630 switch (MI.getOpcode()) {
25631 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25632 case X86::RELEASE_FADD32mr:
25633 FOp = X86::ADDSSrm;
25634 MOp = X86::MOVSSmr;
25636 case X86::RELEASE_FADD64mr:
25637 FOp = X86::ADDSDrm;
25638 MOp = X86::MOVSDmr;
25641 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25642 DebugLoc DL = MI.getDebugLoc();
25643 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
// The value operand of the pseudo follows the 5 X86 address operands.
25644 unsigned ValOpIdx = X86::AddrNumOperands;
25645 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
// Emit the load-and-op instruction into a fresh vreg of the same class as
// the incoming value register.
25646 MachineInstrBuilder MIB =
25647 BuildMI(*BB, MI, DL, TII->get(FOp),
25648 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25650 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25651 MachineOperand &Operand = MI.getOperand(i);
25652 // Clear any kill flags on register operands as we'll create a second
25653 // instruction using the same address operands.
25654 if (Operand.isReg())
25655 Operand.setIsKill(false);
25658 MachineInstr *FOpMI = MIB;
// Second instruction: store the result back through the same address,
// killing the temporary result register.
25659 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25660 for (int i = 0; i < X86::AddrNumOperands; ++i)
25661 MIB.add(MI.getOperand(i));
25662 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25663 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Lower SEG_ALLOCA_32/SEG_ALLOCA_64: a dynamic stack allocation under
// segmented (split) stacks. Splits BB into a limit check, a fast in-stacklet
// bump path (bumpMBB), a slow runtime-allocation path (mallocMBB), and a
// continuation block that PHIs the resulting pointer together.
25667 MachineBasicBlock *
25668 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25669 MachineBasicBlock *BB) const {
25670 MachineFunction *MF = BB->getParent();
25671 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25672 DebugLoc DL = MI.getDebugLoc();
25673 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25675 assert(MF->shouldSplitStack());
25677 const bool Is64Bit = Subtarget.is64Bit();
25678 const bool IsLP64 = Subtarget.isTarget64BitLP64();
// The stack limit lives in TLS; segment register and offset depend on the
// target ABI (FS on 64-bit, GS on 32-bit).
25680 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25681 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25684 // ... [Till the alloca]
25685 // If stacklet is not large enough, jump to mallocMBB
25688 // Allocate by subtracting from RSP
25689 // Jump to continueMBB
25692 // Allocate by call to runtime
25696 // [rest of original BB]
25699 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25700 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25701 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25703 MachineRegisterInfo &MRI = MF->getRegInfo();
25704 const TargetRegisterClass *AddrRegClass =
25705 getRegClassFor(getPointerTy(MF->getDataLayout()));
// Virtual registers for the two possible results (malloc'd pointer vs.
// bumped-SP pointer) and the intermediate SP computation.
25707 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25708 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25709 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25710 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25711 sizeVReg = MI.getOperand(1).getReg(),
25713 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25715 MachineFunction::iterator MBBIter = ++BB->getIterator();
25717 MF->insert(MBBIter, bumpMBB);
25718 MF->insert(MBBIter, mallocMBB);
25719 MF->insert(MBBIter, continueMBB);
// Move everything after the pseudo into the continuation block and make it
// inherit BB's successors.
25721 continueMBB->splice(continueMBB->begin(), BB,
25722 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25723 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25725 // Add code to the main basic block to check if the stack limit has been hit,
25726 // and if so, jump to mallocMBB otherwise to bumpMBB.
25727 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25728 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25729 .addReg(tmpSPVReg).addReg(sizeVReg);
// Compare [TlsReg:TlsOffset] (the stack limit) against the proposed new SP.
25730 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25731 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25732 .addReg(SPLimitVReg)
25733 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25735 // bumpMBB simply decreases the stack pointer, since we know the current
25736 // stacklet has enough space.
25737 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25738 .addReg(SPLimitVReg);
25739 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25740 .addReg(SPLimitVReg);
25741 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25743 // Calls into a routine in libgcc to allocate more space from the heap.
25744 const uint32_t *RegMask =
25745 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
// Slow path: pass the requested size per the target calling convention
// (RDI for LP64, EDI for x32/NaCl64, pushed on the stack for 32-bit) and
// call __morestack_allocate_stack_space; result comes back in RAX/EAX.
25747 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25749 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25750 .addExternalSymbol("__morestack_allocate_stack_space")
25751 .addRegMask(RegMask)
25752 .addReg(X86::RDI, RegState::Implicit)
25753 .addReg(X86::RAX, RegState::ImplicitDefine);
25754 } else if (Is64Bit) {
25755 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25757 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25758 .addExternalSymbol("__morestack_allocate_stack_space")
25759 .addRegMask(RegMask)
25760 .addReg(X86::EDI, RegState::Implicit)
25761 .addReg(X86::EAX, RegState::ImplicitDefine);
25763 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25765 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25766 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25767 .addExternalSymbol("__morestack_allocate_stack_space")
25768 .addRegMask(RegMask)
25769 .addReg(X86::EAX, RegState::ImplicitDefine);
// 32-bit path: pop the pushed argument (and alignment padding) back off.
25773 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25776 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25777 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25778 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25780 // Set up the CFG correctly.
25781 BB->addSuccessor(bumpMBB);
25782 BB->addSuccessor(mallocMBB);
25783 mallocMBB->addSuccessor(continueMBB);
25784 bumpMBB->addSuccessor(continueMBB);
25786 // Take care of the PHI nodes.
// The result of the pseudo is the allocated pointer, whichever path
// produced it.
25787 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25788 MI.getOperand(0).getReg())
25789 .addReg(mallocPtrVReg)
25791 .addReg(bumpSPPtrVReg)
25794 // Delete the original pseudo instruction.
25795 MI.eraseFromParent();
25798 return continueMBB;
// Lower CATCHRET. On 32-bit C++ EH an extra block is inserted before the
// return destination to restore the stack pointer (EH_RESTORE) before
// jumping on; 64-bit targets need no fixup.
25801 MachineBasicBlock *
25802 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25803 MachineBasicBlock *BB) const {
25804 MachineFunction *MF = BB->getParent();
25805 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25806 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25807 DebugLoc DL = MI.getDebugLoc();
25809 assert(!isAsynchronousEHPersonality(
25810 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25811 "SEH does not use catchret!");
25813 // Only 32-bit EH needs to worry about manually restoring stack pointers.
// (Early return for the 64-bit case is elided in this excerpt.)
25814 if (!Subtarget.is32Bit())
25817 // C++ EH creates a new target block to hold the restore code, and wires up
25818 // the new block to the return destination with a normal JMP_4.
25819 MachineBasicBlock *RestoreMBB =
25820 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25821 assert(BB->succ_size() == 1);
25822 MF->insert(std::next(BB->getIterator()), RestoreMBB);
// RestoreMBB takes over BB's successor edge; the CATCHRET is retargeted to
// flow through RestoreMBB instead of directly to TargetMBB.
25823 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25824 BB->addSuccessor(RestoreMBB);
25825 MI.getOperand(0).setMBB(RestoreMBB);
25827 auto RestoreMBBI = RestoreMBB->begin();
25828 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25829 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
// Lower CATCHPAD. Only 32-bit SEH needs code here: an EH_RESTORE is
// inserted to re-establish the stack pointer on entry to the pad.
25833 MachineBasicBlock *
25834 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25835 MachineBasicBlock *BB) const {
25836 MachineFunction *MF = BB->getParent();
25837 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25838 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25839 // Only 32-bit SEH requires special handling for catchpad.
25840 if (IsSEH && Subtarget.is32Bit()) {
25841 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25842 DebugLoc DL = MI.getDebugLoc();
25843 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
// The CATCHPAD pseudo itself is always removed.
25845 MI.eraseFromParent();
25849 MachineBasicBlock *
25850 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25851 MachineBasicBlock *BB) const {
25852 // So, here we replace TLSADDR with the sequence:
25853 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25854 // We need this because TLSADDR is lowered into calls
25855 // inside MC, therefore without the two markers shrink-wrapping
25856 // may push the prologue/epilogue pass them.
25857 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25858 DebugLoc DL = MI.getDebugLoc();
25859 MachineFunction &MF = *BB->getParent();
25861 // Emit CALLSEQ_START right before the instruction.
// Zero-sized call frame: the markers only bracket the implicit call.
25862 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25863 MachineInstrBuilder CallseqStart =
25864 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25865 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25867 // Emit CALLSEQ_END right after the instruction.
25868 // We don't call erase from parent because we want to keep the
25869 // original instruction around.
25870 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25871 MachineInstrBuilder CallseqEnd =
25872 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25873 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
25878 MachineBasicBlock *
25879 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25880 MachineBasicBlock *BB) const {
25881 // This is pretty easy. We're taking the value that we received from
25882 // our load from the relocation, sticking it in either RDI (x86-64)
25883 // or EAX and doing an indirect call. The return value will then
25884 // be in the normal return register.
25885 MachineFunction *F = BB->getParent();
25886 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25887 DebugLoc DL = MI.getDebugLoc();
25889 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25890 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25892 // Get a register mask for the lowered call.
25893 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25894 // proper register mask.
25895 const uint32_t *RegMask =
25896 Subtarget.is64Bit() ?
25897 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25898 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
// 64-bit: load the TLS descriptor address into RDI, then CALL64m through
// it; the TLS pointer result is implicitly defined in RAX.
25899 if (Subtarget.is64Bit()) {
25900 MachineInstrBuilder MIB =
25901 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25905 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25906 MI.getOperand(3).getTargetFlags())
25908 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25909 addDirectMem(MIB, X86::RDI);
25910 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit non-PIC: same shape through EAX with an absolute address.
25911 } else if (!isPositionIndependent()) {
25912 MachineInstrBuilder MIB =
25913 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25917 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25918 MI.getOperand(3).getTargetFlags())
25920 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25921 addDirectMem(MIB, X86::EAX);
25922 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit PIC: address the global relative to the GOT base register.
25924 MachineInstrBuilder MIB =
25925 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25926 .addReg(TII->getGlobalBaseReg(F))
25929 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25930 MI.getOperand(3).getTargetFlags())
25932 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25933 addDirectMem(MIB, X86::EAX);
25934 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25937 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Lower EH_SjLj_SetJmp32/64. Splits the block into thisMBB (stores the
// resume address into the jmp buffer and emits EH_SjLj_Setup), mainMBB
// (normal path, result 0), restoreMBB (longjmp landing, result 1), and
// sinkMBB (PHIs the two results together).
25941 MachineBasicBlock *
25942 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25943 MachineBasicBlock *MBB) const {
25944 DebugLoc DL = MI.getDebugLoc();
25945 MachineFunction *MF = MBB->getParent();
25946 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25947 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25948 MachineRegisterInfo &MRI = MF->getRegInfo();
25950 const BasicBlock *BB = MBB->getBasicBlock();
25951 MachineFunction::iterator I = ++MBB->getIterator();
25953 // Memory Reference
// Preserved so the stores we emit keep the pseudo's memory operand info.
25954 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25955 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25958 unsigned MemOpndSlot = 0;
25960 unsigned CurOp = 0;
25962 DstReg = MI.getOperand(CurOp++).getReg();
25963 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25964 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
25966 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25967 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
// Remaining operands of the pseudo are the jmp-buffer address operands.
25969 MemOpndSlot = CurOp;
25971 MVT PVT = getPointerTy(MF->getDataLayout());
25972 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25973 "Invalid Pointer Size!");
25975 // For v = setjmp(buf), we generate
25978 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25979 // SjLjSetup restoreMBB
25985 // v = phi(main, restore)
25988 // if base pointer being used, load it from frame
25991 MachineBasicBlock *thisMBB = MBB;
25992 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25993 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25994 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25995 MF->insert(I, mainMBB);
25996 MF->insert(I, sinkMBB);
25997 MF->push_back(restoreMBB);
// Its address is stored in the jmp buffer, so it must be marked taken.
25998 restoreMBB->setHasAddressTaken();
26000 MachineInstrBuilder MIB;
26002 // Transfer the remainder of BB and its successor edges to sinkMBB.
26003 sinkMBB->splice(sinkMBB->begin(), MBB,
26004 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26005 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26008 unsigned PtrStoreOpc = 0;
26009 unsigned LabelReg = 0;
// The resume address lives one pointer past the buffer base.
26010 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26011 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26012 !isPositionIndependent();
26014 // Prepare IP either in reg or imm.
26015 if (!UseImmLabel) {
26016 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26017 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26018 LabelReg = MRI.createVirtualRegister(PtrRC);
26019 if (Subtarget.is64Bit()) {
26020 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26024 .addMBB(restoreMBB)
26027 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26028 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26029 .addReg(XII->getGlobalBaseReg(MF))
26032 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
// Small non-PIC code model: the label fits a 32-bit immediate store.
26036 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store restoreMBB's address at buf + LabelOffset.
26038 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26039 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26040 if (i == X86::AddrDisp)
26041 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26043 MIB.add(MI.getOperand(MemOpndSlot + i));
26046 MIB.addReg(LabelReg);
26048 MIB.addMBB(restoreMBB);
26049 MIB.setMemRefs(MMOBegin, MMOEnd);
26051 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26052 .addMBB(restoreMBB);
// The setup may be resumed from anywhere: assume nothing is preserved.
26054 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26055 MIB.addRegMask(RegInfo->getNoPreservedMask());
26056 thisMBB->addSuccessor(mainMBB);
26057 thisMBB->addSuccessor(restoreMBB);
// Normal path: setjmp returns 0.
26061 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26062 mainMBB->addSuccessor(sinkMBB);
26065 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26066 TII->get(X86::PHI), DstReg)
26067 .addReg(mainDstReg).addMBB(mainMBB)
26068 .addReg(restoreDstReg).addMBB(restoreMBB);
// On the longjmp path the base pointer (if used) must be reloaded from
// the slot the frame lowering saved it in.
26071 if (RegInfo->hasBasePointer(*MF)) {
26072 const bool Uses64BitFramePtr =
26073 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26074 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26075 X86FI->setRestoreBasePointer(MF);
26076 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26077 unsigned BasePtr = RegInfo->getBaseRegister();
26078 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26079 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26080 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26081 .setMIFlag(MachineInstr::FrameSetup);
// Longjmp path: setjmp returns 1.
26083 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26084 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26085 restoreMBB->addSuccessor(sinkMBB);
26087 MI.eraseFromParent();
// Lower EH_SjLj_LongJmp32/64: reload FP, the saved resume address, and SP
// from the jmp buffer, then jump indirectly to the resume address.
26091 MachineBasicBlock *
26092 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26093 MachineBasicBlock *MBB) const {
26094 DebugLoc DL = MI.getDebugLoc();
26095 MachineFunction *MF = MBB->getParent();
26096 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26097 MachineRegisterInfo &MRI = MF->getRegInfo();
26099 // Memory Reference
26100 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26101 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26103 MVT PVT = getPointerTy(MF->getDataLayout());
26104 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26105 "Invalid Pointer Size!");
26107 const TargetRegisterClass *RC =
26108 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
// Scratch vreg that receives the resume address before the indirect jump.
26109 unsigned Tmp = MRI.createVirtualRegister(RC);
26110 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26111 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26112 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26113 unsigned SP = RegInfo->getStackRegister();
26115 MachineInstrBuilder MIB;
// Jmp-buffer layout: [0] = FP, [1*ptrsize] = resume label, [2*ptrsize] = SP.
26117 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26118 const int64_t SPOffset = 2 * PVT.getStoreSize();
26120 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26121 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
// Reload FP from buf[0].
26124 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26125 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26126 MIB.add(MI.getOperand(i));
26127 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload the resume address from buf[LabelOffset].
26129 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26130 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26131 if (i == X86::AddrDisp)
26132 MIB.addDisp(MI.getOperand(i), LabelOffset);
26134 MIB.add(MI.getOperand(i));
26136 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP from buf[SPOffset].
26138 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26139 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26140 if (i == X86::AddrDisp)
26141 MIB.addDisp(MI.getOperand(i), SPOffset);
26143 MIB.add(MI.getOperand(i));
26145 MIB.setMemRefs(MMOBegin, MMOEnd);
26147 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26149 MI.eraseFromParent();
// Store DispatchBB's address into the SjLj function context (frame index
// FI), so a longjmp-style unwind can resume at the dispatch block. Uses an
// immediate store when the code model allows it, otherwise materializes the
// address with LEA into a vreg first.
26153 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26154 MachineBasicBlock *MBB,
26155 MachineBasicBlock *DispatchBB,
26157 DebugLoc DL = MI.getDebugLoc();
26158 MachineFunction *MF = MBB->getParent();
26159 MachineRegisterInfo *MRI = &MF->getRegInfo();
26160 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26162 MVT PVT = getPointerTy(MF->getDataLayout());
26163 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// Small non-PIC code model lets us store the block address as an immediate.
26168 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26169 !isPositionIndependent();
26172 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26174 const TargetRegisterClass *TRC =
26175 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26176 VR = MRI->createVirtualRegister(TRC);
26177 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26179 if (Subtarget.is64Bit())
26180 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26184 .addMBB(DispatchBB)
26187 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26188 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26191 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
// Store into the function context at offset 36 (the return-slot field).
26195 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26196 addFrameReference(MIB, FI, 36);
26198 MIB.addMBB(DispatchBB);
// Build the SjLj exception dispatch block: collect all landing pads by
// call-site index, emit a jump-table dispatch (DispatchBB -> DispContBB ->
// jump table, with TrapBB for out-of-range indices), rewire every invoke
// block to target the dispatch instead of its landing pad, and mark the old
// landing pads as ordinary blocks.
26203 MachineBasicBlock *
26204 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26205 MachineBasicBlock *BB) const {
26206 DebugLoc DL = MI.getDebugLoc();
26207 MachineFunction *MF = BB->getParent();
26208 MachineFrameInfo &MFI = MF->getFrameInfo();
26209 MachineRegisterInfo *MRI = &MF->getRegInfo();
26210 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26211 int FI = MFI.getFunctionContextIndex();
26213 // Get a mapping of the call site numbers to all of the landing pads they're
26214 // associated with.
26215 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26216 unsigned MaxCSNum = 0;
26217 for (auto &MBB : *MF) {
26218 if (!MBB.isEHPad())
// Find the EH_LABEL at the head of the pad; its symbol keys the call-site
// lookup below.
26221 MCSymbol *Sym = nullptr;
26222 for (const auto &MI : MBB) {
26223 if (MI.isDebugValue())
26226 assert(MI.isEHLabel() && "expected EH_LABEL");
26227 Sym = MI.getOperand(0).getMCSymbol();
26231 if (!MF->hasCallSiteLandingPad(Sym))
26234 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26235 CallSiteNumToLPad[CSI].push_back(&MBB);
26236 MaxCSNum = std::max(MaxCSNum, CSI);
26240 // Get an ordered list of the machine basic blocks for the jump table.
26241 std::vector<MachineBasicBlock *> LPadList;
26242 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26243 LPadList.reserve(CallSiteNumToLPad.size());
// Call-site indices are 1-based; iterate in order so the jump table is
// ordered by call-site number.
26245 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26246 for (auto &LP : CallSiteNumToLPad[CSI]) {
26247 LPadList.push_back(LP);
26248 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26252 assert(!LPadList.empty() &&
26253 "No landing pad destinations for the dispatch jump table!");
26255 // Create the MBBs for the dispatch code.
26257 // Shove the dispatch's address into the return slot in the function context.
26258 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26259 DispatchBB->setIsEHPad(true);
// TrapBB catches call-site indices beyond the table.
26261 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26262 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26263 DispatchBB->addSuccessor(TrapBB);
26265 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26266 DispatchBB->addSuccessor(DispContBB);
26269 MF->push_back(DispatchBB);
26270 MF->push_back(DispContBB);
26271 MF->push_back(TrapBB);
26273 // Insert code into the entry block that creates and registers the function
26275 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26277 // Create the jump table and associated information
26278 MachineJumpTableInfo *JTI =
26279 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26280 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26282 const X86RegisterInfo &RI = TII->getRegisterInfo();
26283 // Add a register mask with no preserved registers. This results in all
26284 // registers being marked as clobbered.
26285 if (RI.hasBasePointer(*MF)) {
26286 const bool FPIs64Bit =
26287 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26288 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26289 MFI->setRestoreBasePointer(MF);
// Reload the base pointer from its save slot on entry to the dispatch,
// and attach the all-clobbered regmask to that instruction.
26291 unsigned FP = RI.getFrameRegister(*MF);
26292 unsigned BP = RI.getBaseRegister();
26293 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26294 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26295 MFI->getRestoreBasePointerOffset())
26296 .addRegMask(RI.getNoPreservedMask());
// No base pointer: hang the regmask off a NOOP instead.
26298 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26299 .addRegMask(RI.getNoPreservedMask());
// Load the call-site index from the function context, range-check it
// against the table size, and trap if it's out of range.
26302 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26303 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26305 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26307 .addImm(LPadList.size());
26308 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
// Convert the 1-based call-site index to a 0-based table index and jump
// through the jump table (8-byte entries on 64-bit, 4-byte on 32-bit).
26310 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26311 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26314 BuildMI(DispContBB, DL,
26315 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26317 .addImm(Subtarget.is64Bit() ? 8 : 4)
26319 .addJumpTableIndex(MJTI)
26322 // Add the jump table entries as successors to the MBB.
26323 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26324 for (auto &LP : LPadList)
26325 if (SeenMBBs.insert(LP).second)
26326 DispContBB->addSuccessor(LP);
26328 // N.B. the order the invoke BBs are processed in doesn't matter here.
26329 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26330 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26331 for (MachineBasicBlock *MBB : InvokeBBs) {
26332 // Remove the landing pad successor from the invoke block and replace it
26333 // with the new dispatch block.
26334 // Keep a copy of Successors since it's modified inside the loop.
26335 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26337 // FIXME: Avoid quadratic complexity.
26338 for (auto MBBS : Successors) {
26339 if (MBBS->isEHPad()) {
26340 MBB->removeSuccessor(MBBS);
26341 MBBLPads.push_back(MBBS);
26345 MBB->addSuccessor(DispatchBB);
26347 // Find the invoke call and mark all of the callee-saved registers as
26348 // 'implicit defined' so that they're spilled. This prevents code from
26349 // moving instructions to before the EH block, where they will never be
26351 for (auto &II : reverse(*MBB)) {
26355 DenseMap<unsigned, bool> DefRegs;
26356 for (auto &MOp : II.operands())
26358 DefRegs[MOp.getReg()] = true;
26360 MachineInstrBuilder MIB(*MF, &II);
26361 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26362 unsigned Reg = SavedRegs[RI];
26364 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26371 // Mark all former landing pads as non-landing pads. The dispatch is the only
26372 // landing pad now.
26373 for (auto &LP : MBBLPads)
26374 LP->setIsEHPad(false);
26376 // The instruction is gone now.
26377 MI.eraseFromParent();
26381 MachineBasicBlock *
26382 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26383 MachineBasicBlock *BB) const {
26384 MachineFunction *MF = BB->getParent();
26385 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26386 DebugLoc DL = MI.getDebugLoc();
26388 switch (MI.getOpcode()) {
26389 default: llvm_unreachable("Unexpected instr type to insert");
26390 case X86::TAILJMPd64:
26391 case X86::TAILJMPr64:
26392 case X86::TAILJMPm64:
26393 case X86::TAILJMPr64_REX:
26394 case X86::TAILJMPm64_REX:
26395 llvm_unreachable("TAILJMP64 would not be touched here.");
26396 case X86::TCRETURNdi64:
26397 case X86::TCRETURNri64:
26398 case X86::TCRETURNmi64:
26400 case X86::TLS_addr32:
26401 case X86::TLS_addr64:
26402 case X86::TLS_base_addr32:
26403 case X86::TLS_base_addr64:
26404 return EmitLoweredTLSAddr(MI, BB);
26405 case X86::CATCHRET:
26406 return EmitLoweredCatchRet(MI, BB);
26407 case X86::CATCHPAD:
26408 return EmitLoweredCatchPad(MI, BB);
26409 case X86::SEG_ALLOCA_32:
26410 case X86::SEG_ALLOCA_64:
26411 return EmitLoweredSegAlloca(MI, BB);
26412 case X86::TLSCall_32:
26413 case X86::TLSCall_64:
26414 return EmitLoweredTLSCall(MI, BB);
26415 case X86::CMOV_FR32:
26416 case X86::CMOV_FR64:
26417 case X86::CMOV_FR128:
26418 case X86::CMOV_GR8:
26419 case X86::CMOV_GR16:
26420 case X86::CMOV_GR32:
26421 case X86::CMOV_RFP32:
26422 case X86::CMOV_RFP64:
26423 case X86::CMOV_RFP80:
26424 case X86::CMOV_V2F64:
26425 case X86::CMOV_V2I64:
26426 case X86::CMOV_V4F32:
26427 case X86::CMOV_V4F64:
26428 case X86::CMOV_V4I64:
26429 case X86::CMOV_V16F32:
26430 case X86::CMOV_V8F32:
26431 case X86::CMOV_V8F64:
26432 case X86::CMOV_V8I64:
26433 case X86::CMOV_V8I1:
26434 case X86::CMOV_V16I1:
26435 case X86::CMOV_V32I1:
26436 case X86::CMOV_V64I1:
26437 return EmitLoweredSelect(MI, BB);
26439 case X86::RDFLAGS32:
26440 case X86::RDFLAGS64: {
26442 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26443 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26444 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26445 // Permit reads of the FLAGS register without it being defined.
26446 // This intrinsic exists to read external processor state in flags, such as
26447 // the trap flag, interrupt flag, and direction flag, none of which are
26448 // modeled by the backend.
26449 Push->getOperand(2).setIsUndef();
26450 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26452 MI.eraseFromParent(); // The pseudo is gone now.
26456 case X86::WRFLAGS32:
26457 case X86::WRFLAGS64: {
26459 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26461 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26462 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26463 BuildMI(*BB, MI, DL, TII->get(PopF));
26465 MI.eraseFromParent(); // The pseudo is gone now.
26469 case X86::RELEASE_FADD32mr:
26470 case X86::RELEASE_FADD64mr:
26471 return EmitLoweredAtomicFP(MI, BB);
26473 case X86::FP32_TO_INT16_IN_MEM:
26474 case X86::FP32_TO_INT32_IN_MEM:
26475 case X86::FP32_TO_INT64_IN_MEM:
26476 case X86::FP64_TO_INT16_IN_MEM:
26477 case X86::FP64_TO_INT32_IN_MEM:
26478 case X86::FP64_TO_INT64_IN_MEM:
26479 case X86::FP80_TO_INT16_IN_MEM:
26480 case X86::FP80_TO_INT32_IN_MEM:
26481 case X86::FP80_TO_INT64_IN_MEM: {
26482 // Change the floating point control register to use "round towards zero"
26483 // mode when truncating to an integer value.
26484 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26485 addFrameReference(BuildMI(*BB, MI, DL,
26486 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26488 // Load the old value of the high byte of the control word...
26490 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26491 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26494 // Set the high part to be round to zero...
26495 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26498 // Reload the modified control word now...
26499 addFrameReference(BuildMI(*BB, MI, DL,
26500 TII->get(X86::FLDCW16m)), CWFrameIdx);
26502 // Restore the memory image of control word to original value
26503 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26506 // Get the X86 opcode to use.
26508 switch (MI.getOpcode()) {
26509 default: llvm_unreachable("illegal opcode!");
26510 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26511 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26512 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26513 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26514 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26515 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26516 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26517 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26518 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26521 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26522 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26523 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26525 // Reload the original control word now.
26526 addFrameReference(BuildMI(*BB, MI, DL,
26527 TII->get(X86::FLDCW16m)), CWFrameIdx);
26529 MI.eraseFromParent(); // The pseudo instruction is gone now.
26532 // String/text processing lowering.
26533 case X86::PCMPISTRM128REG:
26534 case X86::VPCMPISTRM128REG:
26535 case X86::PCMPISTRM128MEM:
26536 case X86::VPCMPISTRM128MEM:
26537 case X86::PCMPESTRM128REG:
26538 case X86::VPCMPESTRM128REG:
26539 case X86::PCMPESTRM128MEM:
26540 case X86::VPCMPESTRM128MEM:
26541 assert(Subtarget.hasSSE42() &&
26542 "Target must have SSE4.2 or AVX features enabled");
26543 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26545 // String/text processing lowering.
26546 case X86::PCMPISTRIREG:
26547 case X86::VPCMPISTRIREG:
26548 case X86::PCMPISTRIMEM:
26549 case X86::VPCMPISTRIMEM:
26550 case X86::PCMPESTRIREG:
26551 case X86::VPCMPESTRIREG:
26552 case X86::PCMPESTRIMEM:
26553 case X86::VPCMPESTRIMEM:
26554 assert(Subtarget.hasSSE42() &&
26555 "Target must have SSE4.2 or AVX features enabled");
26556 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26558 // Thread synchronization.
26560 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26561 case X86::MONITORX:
26562 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26566 return emitClzero(&MI, BB, Subtarget);
26570 return emitWRPKRU(MI, BB, Subtarget);
26572 return emitRDPKRU(MI, BB, Subtarget);
26575 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26577 case X86::VASTART_SAVE_XMM_REGS:
26578 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26580 case X86::VAARG_64:
26581 return EmitVAARG64WithCustomInserter(MI, BB);
26583 case X86::EH_SjLj_SetJmp32:
26584 case X86::EH_SjLj_SetJmp64:
26585 return emitEHSjLjSetJmp(MI, BB);
26587 case X86::EH_SjLj_LongJmp32:
26588 case X86::EH_SjLj_LongJmp64:
26589 return emitEHSjLjLongJmp(MI, BB);
26591 case X86::Int_eh_sjlj_setup_dispatch:
26592 return EmitSjLjDispatchBlock(MI, BB);
26594 case TargetOpcode::STATEPOINT:
26595 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26596 // this point in the process. We diverge later.
26597 return emitPatchPoint(MI, BB);
26599 case TargetOpcode::STACKMAP:
26600 case TargetOpcode::PATCHPOINT:
26601 return emitPatchPoint(MI, BB);
26603 case X86::LCMPXCHG8B: {
26604 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26605 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
26606 // requires a memory operand. If it happens that current architecture is
26607 // i686 and for current function we need a base pointer
26608 // - which is ESI for i686 - register allocator would not be able to
26609 // allocate registers for an address in form of X(%reg, %reg, Y)
26610 // - there never would be enough unreserved registers during regalloc
26611 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
26612 // We are giving a hand to register allocator by precomputing the address in
26613 // a new vreg using LEA.
26615 // If it is not i686 or there is no base pointer - nothing to do here.
26616 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26619 // Even though this code does not necessarily needs the base pointer to
26620 // be ESI, we check for that. The reason: if this assert fails, there are
26621 // some changes happened in the compiler base pointer handling, which most
26622 // probably have to be addressed somehow here.
26623 assert(TRI->getBaseRegister() == X86::ESI &&
26624 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26625 "base pointer in mind");
26627 MachineRegisterInfo &MRI = MF->getRegInfo();
26628 MVT SPTy = getPointerTy(MF->getDataLayout());
26629 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26630 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26632 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26633 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26634 // does not use index register.
26635 if (AM.IndexReg == X86::NoRegister)
26638 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26639 // four operand definitions that are E[ABCD] registers. We skip them and
26640 // then insert the LEA.
26641 MachineBasicBlock::iterator MBBI(MI);
26642 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26643 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26646 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26648 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26652 case X86::LCMPXCHG16B:
26654 case X86::LCMPXCHG8B_SAVE_EBX:
26655 case X86::LCMPXCHG16B_SAVE_RBX: {
26657 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26658 if (!BB->isLiveIn(BasePtr))
26659 BB->addLiveIn(BasePtr);
26665 //===----------------------------------------------------------------------===//
26666 // X86 Optimization Hooks
26667 //===----------------------------------------------------------------------===//
// Compute known-zero / known-one bit information for X86-specific DAG nodes
// (SETCC, MOVMSK, VSHLI/VSRLI, VZEXT).
// NOTE(review): this listing elides several lines (the KnownZero/KnownOne
// parameters, the switch header, and the break statements between cases) —
// confirm against the full source before editing.
26669 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26672 const APInt &DemandedElts,
26673 const SelectionDAG &DAG,
26674 unsigned Depth) const {
26675 unsigned BitWidth = KnownZero.getBitWidth();
26676 unsigned Opc = Op.getOpcode();
26677 EVT VT = Op.getValueType();
// Only target opcodes (and intrinsics) are legal here; generic nodes must
// go through the generic computeKnownBits path instead.
26678 assert((Opc >= ISD::BUILTIN_OP_END ||
26679 Opc == ISD::INTRINSIC_WO_CHAIN ||
26680 Opc == ISD::INTRINSIC_W_CHAIN ||
26681 Opc == ISD::INTRINSIC_VOID) &&
26682 "Should use MaskedValueIsZero if you don't know whether Op"
26683 " is a target node!");
// Start from the conservative "nothing known" state.
26685 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
26699 // These nodes' second result is a boolean.
26700 if (Op.getResNo() == 0)
26703 case X86ISD::SETCC:
// SETCC produces a 0/1 value, so all bits above bit 0 are known zero.
26704 KnownZero.setBits(1, BitWidth);
26706 case X86ISD::MOVMSK: {
// MOVMSK extracts one sign bit per input vector element into the low
// bits of the result; everything above NumLoBits is known zero.
26707 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26708 KnownZero.setBits(NumLoBits, BitWidth);
26711 case X86ISD::VSHLI:
26712 case X86ISD::VSRLI: {
26713 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
// These target shifts produce zero when the immediate is out of range,
// so every bit of the result is known zero in that case.
26714 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26715 KnownZero = APInt::getAllOnesValue(BitWidth);
26719 DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
26720 unsigned ShAmt = ShiftImm->getZExtValue();
26721 if (Opc == X86ISD::VSHLI) {
// Shift the known bits left along with the value.
26722 KnownZero = KnownZero << ShAmt;
26723 KnownOne = KnownOne << ShAmt;
26724 // Low bits are known zero.
26725 KnownZero.setLowBits(ShAmt);
26727 KnownZero.lshrInPlace(ShAmt);
26728 KnownOne.lshrInPlace(ShAmt);
26729 // High bits are known zero.
26730 KnownZero.setHighBits(ShAmt);
26735 case X86ISD::VZEXT: {
26736 SDValue N0 = Op.getOperand(0);
26737 unsigned NumElts = VT.getVectorNumElements();
26739 EVT SrcVT = N0.getValueType();
26740 unsigned InNumElts = SrcVT.getVectorNumElements();
26741 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
26742 assert(InNumElts >= NumElts && "Illegal VZEXT input");
// Compute known bits at the narrower source width (only the low NumElts
// source elements are demanded), then zero-extend: all bits above the
// source width become known zero.
26744 KnownZero = KnownOne = APInt(InBitWidth, 0);
26745 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
26746 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1);
26747 KnownOne = KnownOne.zext(BitWidth);
26748 KnownZero = KnownZero.zext(BitWidth);
26749 KnownZero.setBits(InBitWidth, BitWidth);
// Compute a lower bound on the number of sign bits produced by
// X86-specific DAG nodes.
// NOTE(review): the switch header, most return statements, and the closing
// braces are elided in this listing — only the visible cases are annotated.
26755 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26756 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
26757 unsigned Depth) const {
26758 unsigned VTBits = Op.getScalarValueSizeInBits();
26759 unsigned Opcode = Op.getOpcode();
26761 case X86ISD::SETCC_CARRY:
26762 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
26765 case X86ISD::VSEXT: {
// Sign extension preserves the source's sign bits and adds
// (dest width - source width) more copies of the sign bit.
26766 SDValue Src = Op.getOperand(0);
26767 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26768 Tmp += VTBits - Src.getScalarValueSizeInBits();
26772 case X86ISD::VSRAI: {
// Arithmetic shift right replicates the sign bit.
// NOTE(review): a line combining ShiftVal with Tmp appears to be elided
// before this return — verify against the full source.
26773 SDValue Src = Op.getOperand(0);
26774 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
26775 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
26777 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
26780 case X86ISD::PCMPGT:
26781 case X86ISD::PCMPEQ:
26783 case X86ISD::VPCOM:
26784 case X86ISD::VPCOMU:
26785 // Vector compares return zero/all-bits result values.
26793 /// Returns true (and the GlobalValue and the offset) if the node is a
26794 /// GlobalAddress + offset.
26795 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26796 const GlobalValue* &GA,
26797 int64_t &Offset) const {
// X86ISD::Wrapper wraps address operands; if it wraps a GlobalAddress,
// extract the global and its constant offset directly.
26798 if (N->getOpcode() == X86ISD::Wrapper) {
26799 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26800 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
26801 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
// NOTE(review): the `return true;` and closing braces for the Wrapper
// path are elided in this listing.
// Otherwise defer to the target-independent matcher.
26805 return TargetLowering::isGAPlusOffset(N, GA, Offset);
26808 // Attempt to match a combined shuffle mask against supported unary shuffle
26810 // TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle to the matched opcode and SrcVT/DstVT to the
// types to use; V1 may be narrowed for extension patterns.
// NOTE(review): the `return true;` lines and several closing braces are
// elided throughout this listing.
26811 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26812 bool AllowFloatDomain, bool AllowIntDomain,
26813 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
26814 const X86Subtarget &Subtarget,
26815 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26816 unsigned NumMaskElts = Mask.size();
26817 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26819 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
26820 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
26821 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
26822 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
// Try each power-of-two extension factor up to 64-bit destination
// elements: element i must come from source element i, with the
// in-between slots undef or zero.
26823 unsigned MaxScale = 64 / MaskEltSize;
26824 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
26826 unsigned NumDstElts = NumMaskElts / Scale;
26827 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26828 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
26829 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
// The source may be narrower than the mask type (min 128 bits); if so,
// extract the low subvector and use VZEXT instead of the in-reg form.
26832 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
26833 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
26834 if (SrcVT != MaskVT)
26835 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
26836 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26837 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26838 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
26839 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
26845 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26846 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26847 isUndefOrEqual(Mask[0], 0) &&
26848 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26849 Shuffle = X86ISD::VZEXT_MOVL;
26850 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
26854 // Check if we have SSE3 which will let us use MOVDDUP etc. The
26855 // instructions are no slower than UNPCKLPD but has the option to
26856 // fold the input operand into even an unaligned memory load.
26857 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
26858 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26859 Shuffle = X86ISD::MOVDDUP;
26860 SrcVT = DstVT = MVT::v2f64;
26863 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26864 Shuffle = X86ISD::MOVSLDUP;
26865 SrcVT = DstVT = MVT::v4f32;
26868 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26869 Shuffle = X86ISD::MOVSHDUP;
26870 SrcVT = DstVT = MVT::v4f32;
// 256-bit variants of the same even/odd duplicate patterns.
26875 if (MaskVT.is256BitVector() && AllowFloatDomain) {
26876 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26877 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26878 Shuffle = X86ISD::MOVDDUP;
26879 SrcVT = DstVT = MVT::v4f64;
26882 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26883 Shuffle = X86ISD::MOVSLDUP;
26884 SrcVT = DstVT = MVT::v8f32;
26887 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26888 Shuffle = X86ISD::MOVSHDUP;
26889 SrcVT = DstVT = MVT::v8f32;
// 512-bit variants.
26894 if (MaskVT.is512BitVector() && AllowFloatDomain) {
26895 assert(Subtarget.hasAVX512() &&
26896 "AVX512 required for 512-bit vector shuffles");
26897 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26898 Shuffle = X86ISD::MOVDDUP;
26899 SrcVT = DstVT = MVT::v8f64;
26902 if (isTargetShuffleEquivalent(
26903 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26904 Shuffle = X86ISD::MOVSLDUP;
26905 SrcVT = DstVT = MVT::v16f32;
26908 if (isTargetShuffleEquivalent(
26909 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26910 Shuffle = X86ISD::MOVSHDUP;
26911 SrcVT = DstVT = MVT::v16f32;
26916 // Attempt to match against broadcast-from-vector.
26917 if (Subtarget.hasAVX2()) {
// An all-zeros mask means every lane takes element 0 — a broadcast.
26918 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26919 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26920 SrcVT = DstVT = MaskVT;
26921 Shuffle = X86ISD::VBROADCAST;
26929 // Attempt to match a combined shuffle mask against supported unary immediate
26930 // permute instructions.
26931 // TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle/ShuffleVT and the 8-bit immediate PermuteImm.
// NOTE(review): several lines (loop bodies' `int M = Mask[i];`, `return`
// statements, early-exit guards and closing braces) are elided in this
// listing.
26932 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26933 bool AllowFloatDomain,
26934 bool AllowIntDomain,
26935 const X86Subtarget &Subtarget,
26936 unsigned &Shuffle, MVT &ShuffleVT,
26937 unsigned &PermuteImm) {
26938 unsigned NumMaskElts = Mask.size();
// Record which mask elements are undef-or-zero (Zeroable) and whether any
// element demands an explicit zero (ContainsZeros).
26940 bool ContainsZeros = false;
26941 APInt Zeroable(NumMaskElts, false);
26942 for (unsigned i = 0; i != NumMaskElts; ++i) {
26944 if (isUndefOrZero(M))
26945 Zeroable.setBit(i);
26946 ContainsZeros |= (M == SM_SentinelZero);
26949 // Attempt to match against byte/bit shifts.
26950 // FIXME: Add 512-bit support.
26951 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26952 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26953 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26954 MaskVT.getScalarSizeInBits(), Mask,
26955 0, Zeroable, Subtarget);
26956 if (0 < ShiftAmt) {
26957 PermuteImm = (unsigned)ShiftAmt;
26962 // Ensure we don't contain any zero elements.
// After this point every mask element must reference the single input
// (or be undef) — immediate permutes cannot materialize zeros.
26966 assert(llvm::all_of(Mask, [&](int M) {
26967 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26968 }) && "Expected unary shuffle");
26970 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26971 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26972 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26974 // Handle PSHUFLW/PSHUFHW repeated patterns.
26975 if (MaskScalarSizeInBits == 16) {
26976 SmallVector<int, 4> RepeatedMask;
26977 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26978 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26979 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26981 // PSHUFLW: permute lower 4 elements only.
26982 if (isUndefOrInRange(LoMask, 0, 4) &&
26983 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26984 Shuffle = X86ISD::PSHUFLW;
26985 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26986 PermuteImm = getV4X86ShuffleImm(LoMask);
26990 // PSHUFHW: permute upper 4 elements only.
26991 if (isUndefOrInRange(HiMask, 4, 8) &&
26992 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26993 // Offset the HiMask so that we can create the shuffle immediate.
26994 int OffsetHiMask[4];
26995 for (int i = 0; i != 4; ++i)
26996 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26998 Shuffle = X86ISD::PSHUFHW;
26999 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27000 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27009 // We only support permutation of 32/64 bit elements after this.
27010 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
27013 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27014 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27015 if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
27018 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
27019 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
27020 AllowFloatDomain = true;
27021 AllowIntDomain = false;
27024 // Check for lane crossing permutes.
27025 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27026 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27027 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
27028 Shuffle = X86ISD::VPERMI;
27029 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27030 PermuteImm = getV4X86ShuffleImm(Mask);
// 512-bit PERMPD/PERMQ requires the 256-bit lanes to repeat so a single
// 4-element immediate can describe both halves.
27033 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
27034 SmallVector<int, 4> RepeatedMask;
27035 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27036 Shuffle = X86ISD::VPERMI;
27037 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27038 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27045 // VPERMILPD can permute with a non-repeating shuffle.
27046 if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
27047 Shuffle = X86ISD::VPERMILPI;
27048 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
// VPERMILPD selects within each 128-bit lane via one bit per element.
27050 for (int i = 0, e = Mask.size(); i != e; ++i) {
27052 if (M == SM_SentinelUndef)
27054 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27055 PermuteImm |= (M & 1) << i;
27060 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
27061 SmallVector<int, 4> RepeatedMask;
27062 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
27065 // Narrow the repeated mask for 32-bit element permutes.
27066 SmallVector<int, 4> WordMask = RepeatedMask;
27067 if (MaskScalarSizeInBits == 64)
27068 scaleShuffleMask(2, RepeatedMask, WordMask);
27070 Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
27071 ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
27072 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27073 PermuteImm = getV4X86ShuffleImm(WordMask);
27077 // Attempt to match a combined unary shuffle mask against supported binary
27078 // shuffle instructions.
27079 // TODO: Investigate sharing more of this with shuffle lowering.
// On success, sets Shuffle/ShuffleVT; V1/V2 may be swapped or rewritten by
// the helpers it calls.
// NOTE(review): two parameter lines (27083/27086, likely SelectionDAG &DAG
// and an IsUnary flag) plus `return true;` lines and closing braces are
// elided in this listing.
27080 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27081 bool AllowFloatDomain, bool AllowIntDomain,
27082 SDValue &V1, SDValue &V2, SDLoc &DL,
27084 const X86Subtarget &Subtarget,
27085 unsigned &Shuffle, MVT &ShuffleVT,
27087 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Fixed-pattern 128-bit matches: MOVLHPS/MOVHLPS duplicate one 64-bit
// half; MOVSD/MOVSS take the low element from one source and the rest
// from the other.
27089 if (MaskVT.is128BitVector()) {
27090 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27092 Shuffle = X86ISD::MOVLHPS;
27093 ShuffleVT = MVT::v4f32;
27096 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27098 Shuffle = X86ISD::MOVHLPS;
27099 ShuffleVT = MVT::v4f32;
27102 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27103 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27105 Shuffle = X86ISD::MOVSD;
27106 ShuffleVT = MaskVT;
27109 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27110 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27111 Shuffle = X86ISD::MOVSS;
27112 ShuffleVT = MaskVT;
27117 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27118 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27119 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27120 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27121 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27122 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27123 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
// Pre-AVX2 there is no 256-bit integer UNPCK, so force a float type.
27125 ShuffleVT = MaskVT;
27126 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27127 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
// Attempt to match a combined shuffle mask against supported binary
// immediate permute instructions (PALIGNR, BLENDI, INSERTPS, SHUFPD,
// SHUFPS).  On success, sets Shuffle/ShuffleVT/PermuteImm and may rewrite
// V1/V2 (e.g. replacing a forced-zero input with a zero vector).
// NOTE(review): a parameter line (27139, likely SelectionDAG &DAG), many
// `return true;` lines and closing braces are elided in this listing.
27135 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27136 bool AllowFloatDomain,
27137 bool AllowIntDomain,
27138 SDValue &V1, SDValue &V2, SDLoc &DL,
27140 const X86Subtarget &Subtarget,
27141 unsigned &Shuffle, MVT &ShuffleVT,
27142 unsigned &PermuteImm) {
27143 unsigned NumMaskElts = Mask.size();
27144 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27146 // Attempt to match against PALIGNR byte rotate.
27147 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27148 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27149 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27150 if (0 < ByteRotation) {
27151 Shuffle = X86ISD::PALIGNR;
27152 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27153 PermuteImm = ByteRotation;
27158 // Attempt to combine to X86ISD::BLENDI.
// BLENDI's immediate has one bit per element, so at most 8 elements —
// except v16i16 PBLENDW, which repeats the immediate per 128-bit lane.
27159 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27160 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27161 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27162 uint64_t BlendMask = 0;
27163 bool ForceV1Zero = false, ForceV2Zero = false;
27164 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27165 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27167 if (MaskVT == MVT::v16i16) {
27168 // We can only use v16i16 PBLENDW if the lanes are repeated.
27169 SmallVector<int, 8> RepeatedMask;
27170 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27172 assert(RepeatedMask.size() == 8 &&
27173 "Repeated mask size doesn't match!");
// Build the 8-bit PBLENDW immediate: bit i set selects V2's element.
27175 for (int i = 0; i < 8; ++i)
27176 if (RepeatedMask[i] >= 8)
27177 PermuteImm |= 1 << i;
27178 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27179 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27180 Shuffle = X86ISD::BLENDI;
27181 ShuffleVT = MaskVT;
27185 // Determine a type compatible with X86ISD::BLENDI.
// BLENDI has no 64-bit-element integer form; pick the nearest supported
// integer (AVX2) or float (pre-AVX2) type instead.
27186 ShuffleVT = MaskVT;
27187 if (Subtarget.hasAVX2()) {
27188 if (ShuffleVT == MVT::v4i64)
27189 ShuffleVT = MVT::v8i32;
27190 else if (ShuffleVT == MVT::v2i64)
27191 ShuffleVT = MVT::v4i32;
27193 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27194 ShuffleVT = MVT::v8i16;
27195 else if (ShuffleVT == MVT::v4i64)
27196 ShuffleVT = MVT::v4f64;
27197 else if (ShuffleVT == MVT::v8i32)
27198 ShuffleVT = MVT::v8f32;
// If the chosen type has narrower elements, widen the blend mask to one
// bit per new element.
27201 if (!ShuffleVT.isFloatingPoint()) {
27202 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27204 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27205 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27206 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27209 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27210 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27211 PermuteImm = (unsigned)BlendMask;
27212 Shuffle = X86ISD::BLENDI;
27218 // Attempt to combine to INSERTPS.
27219 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27220 MaskVT.is128BitVector()) {
27221 APInt Zeroable(4, 0);
27222 for (unsigned i = 0; i != NumMaskElts; ++i)
27224 Zeroable.setBit(i);
// INSERTPS is only worthwhile here when it can also zero elements.
27226 if (Zeroable.getBoolValue() &&
27227 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27228 Shuffle = X86ISD::INSERTPS;
27229 ShuffleVT = MVT::v4f32;
27234 // Attempt to combine to SHUFPD.
27235 if (AllowFloatDomain && EltSizeInBits == 64 &&
27236 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27237 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27238 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27239 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27240 Shuffle = X86ISD::SHUFP;
27241 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27246 // Attempt to combine to SHUFPS.
27247 if (AllowFloatDomain && EltSizeInBits == 32 &&
27248 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27249 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27250 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27251 SmallVector<int, 4> RepeatedMask;
27252 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27253 // Match each half of the repeated mask, to determine if its just
27254 // referencing one of the vectors, is zeroable or entirely undef.
// Returns the source vector for the half and fills S0/S1 with the
// per-element selectors (-1 for undef).
27255 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27256 int M0 = RepeatedMask[Offset];
27257 int M1 = RepeatedMask[Offset + 1];
27259 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27260 return DAG.getUNDEF(MaskVT);
27261 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27262 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27263 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27264 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27265 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27266 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27267 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27269 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27270 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27271 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
// SHUFPS takes its low two elements from Lo and high two from Hi.
27278 int ShufMask[4] = {-1, -1, -1, -1};
27279 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27280 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27285 Shuffle = X86ISD::SHUFP;
27286 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27287 PermuteImm = getV4X86ShuffleImm(ShufMask);
27296 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27299 /// This is the leaf of the recursive combine below. When we have found some
27300 /// chain of single-use x86 shuffle instructions and accumulated the combined
27301 /// shuffle mask represented by them, this will try to pattern match that mask
27302 /// into either a single instruction if there is a special purpose instruction
27303 /// for this operation, or into a PSHUFB instruction which is a fully general
27304 /// instruction but should only be used to replace chains over a certain depth.
27305 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27306 ArrayRef<int> BaseMask, int Depth,
27307 bool HasVariableMask, SelectionDAG &DAG,
27308 TargetLowering::DAGCombinerInfo &DCI,
27309 const X86Subtarget &Subtarget) {
27310 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27311 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27312 "Unexpected number of shuffle inputs!");
27314 // Find the inputs that enter the chain. Note that multiple uses are OK
27315 // here, we're not going to remove the operands we find.
27316 bool UnaryShuffle = (Inputs.size() == 1);
27317 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27318 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27319 : peekThroughBitcasts(Inputs[1]));
27321 MVT VT1 = V1.getSimpleValueType();
27322 MVT VT2 = V2.getSimpleValueType();
27323 MVT RootVT = Root.getSimpleValueType();
27324 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27325 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27326 "Vector size mismatch");
27331 unsigned NumBaseMaskElts = BaseMask.size();
27332 if (NumBaseMaskElts == 1) {
27333 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27334 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27339 unsigned RootSizeInBits = RootVT.getSizeInBits();
27340 unsigned NumRootElts = RootVT.getVectorNumElements();
27341 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27342 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27343 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27345 // Don't combine if we are a AVX512/EVEX target and the mask element size
27346 // is different from the root element size - this would prevent writemasks
27347 // from being reused.
27348 // TODO - this currently prevents all lane shuffles from occurring.
27349 // TODO - check for writemasks usage instead of always preventing combining.
27350 // TODO - attempt to narrow Mask back to writemask size.
27351 bool IsEVEXShuffle =
27352 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27353 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27356 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27358 // Handle 128-bit lane shuffles of 256-bit vectors.
27359 // TODO - this should support binary shuffles.
27360 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27361 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27362 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27363 return false; // Nothing to do!
27364 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27365 unsigned PermMask = 0;
27366 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27367 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27369 Res = DAG.getBitcast(ShuffleVT, V1);
27370 DCI.AddToWorklist(Res.getNode());
27371 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27372 DAG.getUNDEF(ShuffleVT),
27373 DAG.getConstant(PermMask, DL, MVT::i8));
27374 DCI.AddToWorklist(Res.getNode());
27375 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27380 // For masks that have been widened to 128-bit elements or more,
27381 // narrow back down to 64-bit elements.
27382 SmallVector<int, 64> Mask;
27383 if (BaseMaskEltSizeInBits > 64) {
27384 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27385 int MaskScale = BaseMaskEltSizeInBits / 64;
27386 scaleShuffleMask(MaskScale, BaseMask, Mask);
27388 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27391 unsigned NumMaskElts = Mask.size();
27392 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27394 // Determine the effective mask value type.
27395 FloatDomain &= (32 <= MaskEltSizeInBits);
27396 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27397 : MVT::getIntegerVT(MaskEltSizeInBits);
27398 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27400 // Only allow legal mask types.
27401 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27404 // Attempt to match the mask against known shuffle patterns.
27405 MVT ShuffleSrcVT, ShuffleVT;
27406 unsigned Shuffle, PermuteImm;
27408 // Which shuffle domains are permitted?
27409 // Permit domain crossing at higher combine depths.
27410 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27411 bool AllowIntDomain = !FloatDomain || (Depth > 3);
27413 if (UnaryShuffle) {
27414 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27415 // directly if we don't shuffle the lower element and we shuffle the upper
27416 // (zero) elements within themselves.
27417 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27418 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27419 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27420 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27421 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27422 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27423 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27429 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27430 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27432 if (Depth == 1 && Root.getOpcode() == Shuffle)
27433 return false; // Nothing to do!
27434 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27435 return false; // AVX512 Writemask clash.
27436 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27437 DCI.AddToWorklist(Res.getNode());
27438 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27439 DCI.AddToWorklist(Res.getNode());
27440 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27445 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27446 AllowIntDomain, Subtarget, Shuffle,
27447 ShuffleVT, PermuteImm)) {
27448 if (Depth == 1 && Root.getOpcode() == Shuffle)
27449 return false; // Nothing to do!
27450 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27451 return false; // AVX512 Writemask clash.
27452 Res = DAG.getBitcast(ShuffleVT, V1);
27453 DCI.AddToWorklist(Res.getNode());
27454 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27455 DAG.getConstant(PermuteImm, DL, MVT::i8));
27456 DCI.AddToWorklist(Res.getNode());
27457 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27463 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27464 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27466 if (Depth == 1 && Root.getOpcode() == Shuffle)
27467 return false; // Nothing to do!
27468 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27469 return false; // AVX512 Writemask clash.
27470 V1 = DAG.getBitcast(ShuffleVT, V1);
27471 DCI.AddToWorklist(V1.getNode());
27472 V2 = DAG.getBitcast(ShuffleVT, V2);
27473 DCI.AddToWorklist(V2.getNode());
27474 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27475 DCI.AddToWorklist(Res.getNode());
27476 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27481 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27482 AllowIntDomain, V1, V2, DL, DAG,
27483 Subtarget, Shuffle, ShuffleVT,
27485 if (Depth == 1 && Root.getOpcode() == Shuffle)
27486 return false; // Nothing to do!
27487 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27488 return false; // AVX512 Writemask clash.
27489 V1 = DAG.getBitcast(ShuffleVT, V1);
27490 DCI.AddToWorklist(V1.getNode());
27491 V2 = DAG.getBitcast(ShuffleVT, V2);
27492 DCI.AddToWorklist(V2.getNode());
27493 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27494 DAG.getConstant(PermuteImm, DL, MVT::i8));
27495 DCI.AddToWorklist(Res.getNode());
27496 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27501 // Don't try to re-form single instruction chains under any circumstances now
27502 // that we've done encoding canonicalization for them.
27506 bool MaskContainsZeros =
27507 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27509 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27510 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27511 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27512 ((Subtarget.hasAVX2() &&
27513 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27514 (Subtarget.hasAVX512() &&
27515 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27516 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27517 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27518 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27519 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27520 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27521 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27522 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27523 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27524 DCI.AddToWorklist(VPermMask.getNode());
27525 Res = DAG.getBitcast(MaskVT, V1);
27526 DCI.AddToWorklist(Res.getNode());
27527 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27528 DCI.AddToWorklist(Res.getNode());
27529 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27534 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27535 // vector as the second source.
27536 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27537 ((Subtarget.hasAVX512() &&
27538 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27539 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27540 (Subtarget.hasVLX() &&
27541 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27542 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27543 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27544 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27545 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27546 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27547 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27548 for (unsigned i = 0; i != NumMaskElts; ++i)
27549 if (Mask[i] == SM_SentinelZero)
27550 Mask[i] = NumMaskElts + i;
27552 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27553 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27554 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27555 DCI.AddToWorklist(VPermMask.getNode());
27556 Res = DAG.getBitcast(MaskVT, V1);
27557 DCI.AddToWorklist(Res.getNode());
27558 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27559 DCI.AddToWorklist(Zero.getNode());
27560 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27561 DCI.AddToWorklist(Res.getNode());
27562 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27567 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27568 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27569 ((Subtarget.hasAVX512() &&
27570 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27571 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27572 (Subtarget.hasVLX() &&
27573 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27574 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27575 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27576 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27577 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27578 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27579 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27580 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27581 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27582 DCI.AddToWorklist(VPermMask.getNode());
27583 V1 = DAG.getBitcast(MaskVT, V1);
27584 DCI.AddToWorklist(V1.getNode());
27585 V2 = DAG.getBitcast(MaskVT, V2);
27586 DCI.AddToWorklist(V2.getNode());
27587 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27588 DCI.AddToWorklist(Res.getNode());
27589 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27596 // See if we can combine a single input shuffle with zeros to a bit-mask,
27597 // which is much simpler than any shuffle.
27598 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27599 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27600 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27601 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27602 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27603 APInt UndefElts(NumMaskElts, 0);
27604 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27605 for (unsigned i = 0; i != NumMaskElts; ++i) {
27607 if (M == SM_SentinelUndef) {
27608 UndefElts.setBit(i);
27611 if (M == SM_SentinelZero)
27613 EltBits[i] = AllOnes;
27615 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27616 DCI.AddToWorklist(BitMask.getNode());
27617 Res = DAG.getBitcast(MaskVT, V1);
27618 DCI.AddToWorklist(Res.getNode());
27619 unsigned AndOpcode =
27620 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27621 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27622 DCI.AddToWorklist(Res.getNode());
27623 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27628 // If we have a single input shuffle with different shuffle patterns in the
27629 // the 128-bit lanes use the variable mask to VPERMILPS.
27630 // TODO Combine other mask types at higher depths.
27631 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27632 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27633 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27634 SmallVector<SDValue, 16> VPermIdx;
27635 for (int M : Mask) {
27637 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27638 VPermIdx.push_back(Idx);
27640 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27641 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27642 DCI.AddToWorklist(VPermMask.getNode());
27643 Res = DAG.getBitcast(MaskVT, V1);
27644 DCI.AddToWorklist(Res.getNode());
27645 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27646 DCI.AddToWorklist(Res.getNode());
27647 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27652 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27653 // to VPERMIL2PD/VPERMIL2PS.
27654 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27655 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27656 MaskVT == MVT::v8f32)) {
27657 // VPERMIL2 Operation.
27658 // Bits[3] - Match Bit.
27659 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27660 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
27661 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27662 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27663 SmallVector<int, 8> VPerm2Idx;
27664 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27665 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27666 unsigned M2ZImm = 0;
27667 for (int M : Mask) {
27668 if (M == SM_SentinelUndef) {
27669 VPerm2Idx.push_back(-1);
27672 if (M == SM_SentinelZero) {
27674 VPerm2Idx.push_back(8);
27677 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27678 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27679 VPerm2Idx.push_back(Index);
27681 V1 = DAG.getBitcast(MaskVT, V1);
27682 DCI.AddToWorklist(V1.getNode());
27683 V2 = DAG.getBitcast(MaskVT, V2);
27684 DCI.AddToWorklist(V2.getNode());
27685 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27686 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27687 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27688 DAG.getConstant(M2ZImm, DL, MVT::i8));
27689 DCI.AddToWorklist(Res.getNode());
27690 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27695 // If we have 3 or more shuffle instructions or a chain involving a variable
27696 // mask, we can replace them with a single PSHUFB instruction profitably.
27697 // Intel's manuals suggest only using PSHUFB if doing so replacing 5
27698 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27699 // more aggressive.
27700 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27701 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27702 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27703 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27704 SmallVector<SDValue, 16> PSHUFBMask;
27705 int NumBytes = RootVT.getSizeInBits() / 8;
27706 int Ratio = NumBytes / NumMaskElts;
27707 for (int i = 0; i < NumBytes; ++i) {
27708 int M = Mask[i / Ratio];
27709 if (M == SM_SentinelUndef) {
27710 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27713 if (M == SM_SentinelZero) {
27714 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27717 M = Ratio * M + i % Ratio;
27718 assert ((M / 16) == (i / 16) && "Lane crossing detected");
27719 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27721 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27722 Res = DAG.getBitcast(ByteVT, V1);
27723 DCI.AddToWorklist(Res.getNode());
27724 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27725 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27726 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27727 DCI.AddToWorklist(Res.getNode());
27728 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27733 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27734 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27735 // slower than PSHUFB on targets that support both.
27736 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27737 Subtarget.hasXOP()) {
27738 // VPPERM Mask Operation
27739 // Bits[4:0] - Byte Index (0 - 31)
27740 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
27741 SmallVector<SDValue, 16> VPPERMMask;
27743 int Ratio = NumBytes / NumMaskElts;
27744 for (int i = 0; i < NumBytes; ++i) {
27745 int M = Mask[i / Ratio];
27746 if (M == SM_SentinelUndef) {
27747 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27750 if (M == SM_SentinelZero) {
27751 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27754 M = Ratio * M + i % Ratio;
27755 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27757 MVT ByteVT = MVT::v16i8;
27758 V1 = DAG.getBitcast(ByteVT, V1);
27759 DCI.AddToWorklist(V1.getNode());
27760 V2 = DAG.getBitcast(ByteVT, V2);
27761 DCI.AddToWorklist(V2.getNode());
27762 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27763 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27764 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27765 DCI.AddToWorklist(Res.getNode());
27766 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27771 // Failed to find any combines.
// NOTE(review): this numbered-line extraction has gaps (e.g. original lines
// 27784, 27798-27801, 27859-27861 are absent), so some statements of this
// function are not visible here; only comments have been added below.
27775 // Attempt to constant fold all of the constant source ops.
27776 // Returns true if the entire shuffle is folded to a constant.
27777 // TODO: Extend this to merge multiple constant Ops and update the mask.
27778 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27779 ArrayRef<int> Mask, SDValue Root,
27780 bool HasVariableMask, SelectionDAG &DAG,
27781 TargetLowering::DAGCombinerInfo &DCI,
27782 const X86Subtarget &Subtarget) {
27783 MVT VT = Root.getSimpleValueType();
// Width (in bits) covered by each mask element of the root shuffle.
27785 unsigned SizeInBits = VT.getSizeInBits();
27786 unsigned NumMaskElts = Mask.size();
27787 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27788 unsigned NumOps = Ops.size();
27790 // Extract constant bits from each source op.
// For each source op, collect a per-element undef bitmask and the raw
// constant bits; bail out (on the elided path) if any op is not constant.
27791 bool OneUseConstantOp = false;
27792 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27793 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27794 for (unsigned i = 0; i != NumOps; ++i) {
27795 SDValue SrcOp = Ops[i];
27796 OneUseConstantOp |= SrcOp.hasOneUse();
27797 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27802 // Only fold if at least one of the constants is only used once or
27803 // the combined shuffle has included a variable mask shuffle, this
27804 // is to avoid constant pool bloat.
27805 if (!OneUseConstantOp && !HasVariableMask)
27808 // Shuffle the constant bits according to the mask.
// Classify every result element as undef, zero, or a constant taken from
// the referenced source element; the three bitmasks must end up covering
// all elements (asserted below).
27809 APInt UndefElts(NumMaskElts, 0);
27810 APInt ZeroElts(NumMaskElts, 0);
27811 APInt ConstantElts(NumMaskElts, 0);
27812 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27813 APInt::getNullValue(MaskSizeInBits));
27814 for (unsigned i = 0; i != NumMaskElts; ++i) {
27816 if (M == SM_SentinelUndef) {
27817 UndefElts.setBit(i);
27819 } else if (M == SM_SentinelZero) {
27820 ZeroElts.setBit(i);
27823 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
// Decode the mask index into (source op, element within that op).
27825 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27826 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27828 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27829 if (SrcUndefElts[SrcMaskIdx]) {
27830 UndefElts.setBit(i);
27834 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27835 APInt &Bits = SrcEltBits[SrcMaskIdx];
27837 ZeroElts.setBit(i);
27841 ConstantElts.setBit(i);
27842 ConstantBitData[i] = Bits;
27844 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue())
27846 // Create the constant data.
// Prefer a float element type when the root is FP so no extra domain
// crossing is introduced by the replacement constant.
27848 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27849 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27851 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27853 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
// Replace the whole shuffle chain rooted at Root with the folded constant.
27856 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27857 DCI.AddToWorklist(CstOp.getNode());
27858 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
// NOTE(review): several original lines of this function are elided in this
// extraction (e.g. 27901-27903 depth check, 27921-27922, 28043-28045); only
// comments have been added below.
27862 /// \brief Fully generic combining of x86 shuffle instructions.
27864 /// This should be the last combine run over the x86 shuffle instructions. Once
27865 /// they have been fully optimized, this will recursively consider all chains
27866 /// of single-use shuffle instructions, build a generic model of the cumulative
27867 /// shuffle operation, and check for simpler instructions which implement this
27868 /// operation. We use this primarily for two purposes:
27870 /// 1) Collapse generic shuffles to specialized single instructions when
27871 /// equivalent. In most cases, this is just an encoding size win, but
27872 /// sometimes we will collapse multiple generic shuffles into a single
27873 /// special-purpose shuffle.
27874 /// 2) Look for sequences of shuffle instructions with 3 or more total
27875 /// instructions, and replace them with the slightly more expensive SSSE3
27876 /// PSHUFB instruction if available. We do this as the last combining step
27877 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27878 /// a suitable short sequence of other instructions. The PSHUFB will either
27879 /// use a register or have to read from memory and so is slightly (but only
27880 /// slightly) more expensive than the other shuffle instructions.
27882 /// Because this is inherently a quadratic operation (for each shuffle in
27883 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27884 /// This should never be an issue in practice as the shuffle lowering doesn't
27885 /// produce sequences of more than 8 instructions.
27887 /// FIXME: We will currently miss some cases where the redundant shuffling
27888 /// would simplify under the threshold for PSHUFB formation because of
27889 /// combine-ordering. To fix this, we should do the redundant instruction
27890 /// combining in this recursive walk.
27891 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27892 int SrcOpIndex, SDValue Root,
27893 ArrayRef<int> RootMask,
27894 ArrayRef<const SDNode*> SrcNodes,
27895 int Depth, bool HasVariableMask,
27897 TargetLowering::DAGCombinerInfo &DCI,
27898 const X86Subtarget &Subtarget) {
27899 // Bound the depth of our recursive combine because this is ultimately
27900 // quadratic in nature.
27904 // Directly rip through bitcasts to find the underlying operand.
27905 SDValue Op = SrcOps[SrcOpIndex];
27906 Op = peekThroughOneUseBitcasts(Op);
27908 MVT VT = Op.getSimpleValueType();
27909 if (!VT.isVector())
27910 return false; // Bail if we hit a non-vector.
27912 assert(Root.getSimpleValueType().isVector() &&
27913 "Shuffles operate on vector types!");
27914 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27915 "Can only combine shuffles of the same vector register size.");
27917 // Extract target shuffle mask and resolve sentinels and inputs.
27918 SmallVector<int, 64> OpMask;
27919 SmallVector<SDValue, 2> OpInputs;
27920 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
// Target shuffles have at most two vector inputs.
27923 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27924 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27925 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27927 // Add the inputs to the Ops list, avoiding duplicates.
27928 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
// Look through bitcasts when checking for an already-present input so the
// same underlying value is never added twice.
27930 int InputIdx0 = -1, InputIdx1 = -1;
27931 for (int i = 0, e = Ops.size(); i < e; ++i) {
27932 SDValue BC = peekThroughBitcasts(Ops[i]);
27933 if (Input0 && BC == peekThroughBitcasts(Input0))
27935 if (Input1 && BC == peekThroughBitcasts(Input1))
// Input0 (if new) replaces the op we are recursing through; Input1 (if new)
// is appended at the end of the op list.
27939 if (Input0 && InputIdx0 < 0) {
27940 InputIdx0 = SrcOpIndex;
27941 Ops[SrcOpIndex] = Input0;
27943 if (Input1 && InputIdx1 < 0) {
27944 InputIdx1 = Ops.size();
27945 Ops.push_back(Input1);
27948 assert(((RootMask.size() > OpMask.size() &&
27949 RootMask.size() % OpMask.size() == 0) ||
27950 (OpMask.size() > RootMask.size() &&
27951 OpMask.size() % RootMask.size() == 0) ||
27952 OpMask.size() == RootMask.size()) &&
27953 "The smaller number of elements must divide the larger.");
// Work at the finer of the two mask granularities; exactly one of the two
// ratios can be > 1 (asserted below).
27954 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27955 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27956 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27957 assert(((RootRatio == 1 && OpRatio == 1) ||
27958 (RootRatio == 1) != (OpRatio == 1)) &&
27959 "Must not have a ratio for both incoming and op masks!");
27961 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27963 // Merge this shuffle operation's mask into our accumulated mask. Note that
27964 // this shuffle's mask will be the first applied to the input, followed by the
27965 // root mask to get us all the way to the root value arrangement. The reason
27966 // for this order is that we are recursing up the operation chain.
27967 for (int i = 0; i < MaskWidth; ++i) {
27968 int RootIdx = i / RootRatio;
27969 if (RootMask[RootIdx] < 0) {
27970 // This is a zero or undef lane, we're done.
27971 Mask[i] = RootMask[RootIdx];
27975 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27977 // Just insert the scaled root mask value if it references an input other
27978 // than the SrcOp we're currently inserting.
27979 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27980 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27981 Mask[i] = RootMaskedIdx;
// Reduce to an index within the current op before mapping through OpMask.
27985 RootMaskedIdx %= MaskWidth;
27987 int OpIdx = RootMaskedIdx / OpRatio;
27988 if (OpMask[OpIdx] < 0) {
27989 // The incoming lanes are zero or undef, it doesn't matter which ones we
27991 Mask[i] = OpMask[OpIdx];
27995 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27996 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27997 OpMaskedIdx %= MaskWidth;
// Rebase the index onto whichever input (Input0/Input1) it refers to.
27999 if (OpMask[OpIdx] < (int)OpMask.size()) {
28000 assert(0 <= InputIdx0 && "Unknown target shuffle input");
28001 OpMaskedIdx += InputIdx0 * MaskWidth;
28003 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28004 OpMaskedIdx += InputIdx1 * MaskWidth;
28007 Mask[i] = OpMaskedIdx;
28010 // Handle the all undef/zero cases early.
28011 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28012 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28015 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28016 // TODO - should we handle the mixed zero/undef case as well? Just returning
28017 // a zero mask will lose information on undef elements possibly reducing
28018 // future combine possibilities.
28019 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28020 Subtarget, DAG, SDLoc(Root)));
28024 // Remove unused shuffle source ops.
28025 resolveTargetShuffleInputsAndMask(Ops, Mask);
28026 assert(!Ops.empty() && "Shuffle with no inputs detected");
28028 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28030 // Update the list of shuffle nodes that have been combined so far.
28031 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28033 CombinedNodes.push_back(Op.getNode());
28035 // See if we can recurse into each shuffle source op (if it's a target
28036 // shuffle). The source op should only be combined if it either has a
28037 // single use (i.e. current Op) or all its users have already been combined.
28038 for (int i = 0, e = Ops.size(); i < e; ++i)
28039 if (Ops[i].getNode()->hasOneUse() ||
28040 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28041 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28042 Depth + 1, HasVariableMask, DAG, DCI,
28046 // Attempt to constant fold all of the constant source ops.
28047 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28051 // We can only combine unary and binary shuffle mask cases.
28052 if (Ops.size() > 2)
28055 // Minor canonicalization of the accumulated shuffle mask to make it easier
28056 // to match below. All this does is detect masks with sequential pairs of
28057 // elements, and shrink them to the half-width mask. It does this in a loop
28058 // so it will reduce the size of the mask to the minimal width mask which
28059 // performs an equivalent shuffle.
28060 SmallVector<int, 64> WidenedMask;
28061 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28062 Mask = std::move(WidenedMask);
28065 // Canonicalization of binary shuffle masks to improve pattern matching by
28066 // commuting the inputs.
28067 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28068 ShuffleVectorSDNode::commuteMask(Mask);
28069 std::swap(Ops[0], Ops[1]);
// Finally try to match the accumulated mask against specific instructions.
28072 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
// NOTE(review): a few original lines are elided in this extraction (e.g.
// 28084-28085 IsUnary declaration, the PSHUFD/PSHUFLW early returns); only
// comments have been changed below.
28076 /// \brief Get the PSHUF-style mask from PSHUF node.
28078 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28079 /// PSHUF-style masks that can be reused with such instructions.
28080 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28081 MVT VT = N.getSimpleValueType();
28082 SmallVector<int, 4> Mask;
28083 SmallVector<SDValue, 2> Ops;
28086 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28090 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28091 // matter. Check that the upper masks are repeats and remove them.
28092 if (VT.getSizeInBits() > 128) {
28093 int LaneElts = 128 / VT.getScalarSizeInBits();
28095 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28096 for (int j = 0; j < LaneElts; ++j)
28097 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28098 "Mask doesn't repeat in high 128-bit lanes!");
28100 Mask.resize(LaneElts);
// Normalize to a 4-element mask; for PSHUFHW the shuffled elements are the
// high four words, so drop the low half and rebase the indices to 0..3.
28103 switch (N.getOpcode()) {
28104 case X86ISD::PSHUFD:
28106 case X86ISD::PSHUFLW:
28109 case X86ISD::PSHUFHW:
28110 Mask.erase(Mask.begin(), Mask.begin() + 4);
28111 for (int &M : Mask)
28115 llvm_unreachable("No valid shuffle instruction found!");
// NOTE(review): this extraction elides a number of original lines (break/
// continue statements, default labels, the DL declaration, etc.); only
// comments have been added below.
28119 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28121 /// We walk up the chain and look for a combinable shuffle, skipping over
28122 /// shuffles that we could hoist this shuffle's transformation past without
28123 /// altering anything.
28125 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28126 SelectionDAG &DAG) {
28127 assert(N.getOpcode() == X86ISD::PSHUFD &&
28128 "Called with something other than an x86 128-bit half shuffle!");
28131 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28132 // of the shuffles in the chain so that we can form a fresh chain to replace
28134 SmallVector<SDValue, 8> Chain;
28135 SDValue V = N.getOperand(0);
28136 for (; V.hasOneUse(); V = V.getOperand(0)) {
28137 switch (V.getOpcode()) {
28139 return SDValue(); // Nothing combined!
28142 // Skip bitcasts as we always know the type for the target specific
28146 case X86ISD::PSHUFD:
28147 // Found another dword shuffle.
28150 case X86ISD::PSHUFLW:
28151 // Check that the low words (being shuffled) are the identity in the
28152 // dword shuffle, and the high words are self-contained.
28153 if (Mask[0] != 0 || Mask[1] != 1 ||
28154 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
// Remember this node so the chain can be rebuilt below the new shuffle.
28157 Chain.push_back(V);
28160 case X86ISD::PSHUFHW:
28161 // Check that the high words (being shuffled) are the identity in the
28162 // dword shuffle, and the low words are self-contained.
28163 if (Mask[2] != 2 || Mask[3] != 3 ||
28164 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28167 Chain.push_back(V);
28170 case X86ISD::UNPCKL:
28171 case X86ISD::UNPCKH:
28172 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28173 // shuffle into a preceding word shuffle.
28174 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28175 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28178 // Search for a half-shuffle which we can combine with.
// UNPCKL pulls from the low half, so only a PSHUFLW can be hoisted past
// it; symmetrically, UNPCKH pairs with PSHUFHW.
28179 unsigned CombineOp =
28180 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
// Only handle the self-unpack form with a single user of the operand.
28181 if (V.getOperand(0) != V.getOperand(1) ||
28182 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28184 Chain.push_back(V);
28185 V = V.getOperand(0);
28187 switch (V.getOpcode()) {
28189 return SDValue(); // Nothing to combine.
28191 case X86ISD::PSHUFLW:
28192 case X86ISD::PSHUFHW:
28193 if (V.getOpcode() == CombineOp)
28196 Chain.push_back(V);
28200 V = V.getOperand(0);
28204 } while (V.hasOneUse());
28207 // Break out of the loop if we break out of the switch.
28211 if (!V.hasOneUse())
28212 // We fell out of the loop without finding a viable combining instruction.
28215 // Merge this node's mask and our incoming mask.
28216 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28217 for (int &M : Mask)
// Re-emit the found shuffle with the composed 4-element immediate mask.
28219 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28220 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28222 // Rebuild the chain around this new shuffle.
28223 while (!Chain.empty()) {
28224 SDValue W = Chain.pop_back_val();
28226 if (V.getValueType() != W.getOperand(0).getValueType())
28227 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28229 switch (W.getOpcode()) {
28231 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28233 case X86ISD::UNPCKL:
28234 case X86ISD::UNPCKH:
// The unpacks on this path were self-unpacks (op0 == op1), so feed the
// rebuilt value into both operands.
28235 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28238 case X86ISD::PSHUFD:
28239 case X86ISD::PSHUFLW:
28240 case X86ISD::PSHUFHW:
28241 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28245 if (V.getValueType() != N.getValueType())
28246 V = DAG.getBitcast(N.getValueType(), V);
28248 // Return the new chain to replace N.
// NOTE(review): several original lines are elided in this extraction (e.g.
// the second doc line of the \brief, the definition of Old around 28300, and
// various break/return statements); only comments have been added below.
28252 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28255 /// We walk up the chain, skipping shuffles of the other half and looking
28256 /// through shuffles which switch halves trying to find a shuffle of the same
28257 /// pair of dwords.
28258 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28260 TargetLowering::DAGCombinerInfo &DCI) {
28262 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28263 "Called with something other than an x86 128-bit half shuffle!");
// We are looking for a preceding shuffle of the same half (same opcode).
28265 unsigned CombineOpcode = N.getOpcode();
28267 // Walk up a single-use chain looking for a combinable shuffle.
28268 SDValue V = N.getOperand(0);
28269 for (; V.hasOneUse(); V = V.getOperand(0)) {
28270 switch (V.getOpcode()) {
28272 return false; // Nothing combined!
28275 // Skip bitcasts as we always know the type for the target specific
28279 case X86ISD::PSHUFLW:
28280 case X86ISD::PSHUFHW:
28281 if (V.getOpcode() == CombineOpcode)
28284 // Other-half shuffles are no-ops.
28287 // Break out of the loop if we break out of the switch.
28291 if (!V.hasOneUse())
28292 // We fell out of the loop without finding a viable combining instruction.
28295 // Combine away the bottom node as its shuffle will be accumulated into
28296 // a preceding shuffle.
28297 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28299 // Record the old value.
28302 // Merge this node's mask and our incoming mask (adjusted to account for all
28303 // the pshufd instructions encountered).
28304 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28305 for (int &M : Mask)
// Re-emit the found half-shuffle with the composed immediate mask.
28307 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28308 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28310 // Check that the shuffles didn't cancel each other out. If not, we need to
28311 // combine to the new one.
28313 // Replace the combinable shuffle with the combined one, updating all users
28314 // so that we re-evaluate the chain here.
28315 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28320 /// \brief Try to combine x86 target specific shuffles.
28321 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28322 TargetLowering::DAGCombinerInfo &DCI,
28323 const X86Subtarget &Subtarget) {
28325 MVT VT = N.getSimpleValueType();
28326 SmallVector<int, 4> Mask;
28328 unsigned Opcode = N.getOpcode();
28330 case X86ISD::PSHUFD:
28331 case X86ISD::PSHUFLW:
28332 case X86ISD::PSHUFHW:
28333 Mask = getPSHUFShuffleMask(N);
28334 assert(Mask.size() == 4);
28336 case X86ISD::UNPCKL: {
28337 auto Op0 = N.getOperand(0);
28338 auto Op1 = N.getOperand(1);
28339 unsigned Opcode0 = Op0.getOpcode();
28340 unsigned Opcode1 = Op1.getOpcode();
28342 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28343 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28344 // TODO: Add other horizontal operations as required.
28345 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28346 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28348 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28349 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28350 // moves upper half elements into the lower half part. For example:
28352 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28354 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28356 // will be combined to:
28358 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28360 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28361 // happen due to advanced instructions.
28362 if (!VT.is128BitVector())
28365 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28366 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28368 unsigned NumElts = VT.getVectorNumElements();
28369 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28370 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28373 auto ShufOp = Op1.getOperand(0);
28374 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28375 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28379 case X86ISD::BLENDI: {
28380 SDValue V0 = N->getOperand(0);
28381 SDValue V1 = N->getOperand(1);
28382 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28383 "Unexpected input vector types");
28385 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28386 // operands and changing the mask to 1. This saves us a bunch of
28387 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28388 // x86InstrInfo knows how to commute this back after instruction selection
28389 // if it would help register allocation.
28391 // TODO: If optimizing for size or a processor that doesn't suffer from
28392 // partial register update stalls, this should be transformed into a MOVSD
28393 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
28395 if (VT == MVT::v2f64)
28396 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28397 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28398 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28399 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28404 case X86ISD::MOVSD:
28405 case X86ISD::MOVSS: {
28406 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28407 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28408 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28409 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28410 if (isZero0 && isZero1)
28413 // We often lower to MOVSD/MOVSS from integer as well as native float
28414 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28415 // easier to combine shuffles later on. We've already accounted for the
28416 // domain switching cost when we decided to lower with it.
28417 bool isFloat = VT.isFloatingPoint();
28418 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28419 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28420 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28421 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28422 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28423 V0 = DAG.getBitcast(NewVT, V0);
28424 V1 = DAG.getBitcast(NewVT, V1);
28425 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28430 case X86ISD::INSERTPS: {
28431 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28432 SDValue Op0 = N.getOperand(0);
28433 SDValue Op1 = N.getOperand(1);
28434 SDValue Op2 = N.getOperand(2);
28435 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28436 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28437 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28438 unsigned ZeroMask = InsertPSMask & 0xF;
28440 // If we zero out all elements from Op0 then we don't need to reference it.
28441 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28442 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28443 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28445 // If we zero out the element from Op1 then we don't need to reference it.
28446 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28447 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28448 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28450 // Attempt to merge insertps Op1 with an inner target shuffle node.
28451 SmallVector<int, 8> TargetMask1;
28452 SmallVector<SDValue, 2> Ops1;
28453 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28454 int M = TargetMask1[SrcIdx];
28455 if (isUndefOrZero(M)) {
28456 // Zero/UNDEF insertion - zero out element and remove dependency.
28457 InsertPSMask |= (1u << DstIdx);
28458 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28459 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28461 // Update insertps mask srcidx and reference the source input directly.
28462 assert(0 <= M && M < 8 && "Shuffle index out of range");
28463 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28464 Op1 = Ops1[M < 4 ? 0 : 1];
28465 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28466 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28469 // Attempt to merge insertps Op0 with an inner target shuffle node.
28470 SmallVector<int, 8> TargetMask0;
28471 SmallVector<SDValue, 2> Ops0;
28472 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28475 bool Updated = false;
28476 bool UseInput00 = false;
28477 bool UseInput01 = false;
28478 for (int i = 0; i != 4; ++i) {
28479 int M = TargetMask0[i];
28480 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28481 // No change if element is already zero or the inserted element.
28483 } else if (isUndefOrZero(M)) {
28484 // If the target mask is undef/zero then we must zero the element.
28485 InsertPSMask |= (1u << i);
28490 // The input vector element must be inline.
28491 if (M != i && M != (i + 4))
28494 // Determine which inputs of the target shuffle we're using.
28495 UseInput00 |= (0 <= M && M < 4);
28496 UseInput01 |= (4 <= M);
28499 // If we're not using both inputs of the target shuffle then use the
28500 // referenced input directly.
28501 if (UseInput00 && !UseInput01) {
28504 } else if (!UseInput00 && UseInput01) {
28510 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28511 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28519 // Nuke no-op shuffles that show up after combining.
28520 if (isNoopShuffleMask(Mask))
28521 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28523 // Look for simplifications involving one or two shuffle instructions.
28524 SDValue V = N.getOperand(0);
28525 switch (N.getOpcode()) {
28528 case X86ISD::PSHUFLW:
28529 case X86ISD::PSHUFHW:
28530 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28532 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28533 return SDValue(); // We combined away this shuffle, so we're done.
28535 // See if this reduces to a PSHUFD which is no more expensive and can
28536 // combine with more operations. Note that it has to at least flip the
28537 // dwords as otherwise it would have been removed as a no-op.
28538 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28539 int DMask[] = {0, 1, 2, 3};
28540 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28541 DMask[DOffset + 0] = DOffset + 1;
28542 DMask[DOffset + 1] = DOffset + 0;
28543 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28544 V = DAG.getBitcast(DVT, V);
28545 DCI.AddToWorklist(V.getNode());
28546 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28547 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28548 DCI.AddToWorklist(V.getNode());
28549 return DAG.getBitcast(VT, V);
28552 // Look for shuffle patterns which can be implemented as a single unpack.
28553 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28554 // only works when we have a PSHUFD followed by two half-shuffles.
28555 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28556 (V.getOpcode() == X86ISD::PSHUFLW ||
28557 V.getOpcode() == X86ISD::PSHUFHW) &&
28558 V.getOpcode() != N.getOpcode() &&
28560 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28561 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28562 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28563 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28564 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28565 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28567 for (int i = 0; i < 4; ++i) {
28568 WordMask[i + NOffset] = Mask[i] + NOffset;
28569 WordMask[i + VOffset] = VMask[i] + VOffset;
28571 // Map the word mask through the DWord mask.
28573 for (int i = 0; i < 8; ++i)
28574 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28575 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28576 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28577 // We can replace all three shuffles with an unpack.
28578 V = DAG.getBitcast(VT, D.getOperand(0));
28579 DCI.AddToWorklist(V.getNode());
28580 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28589 case X86ISD::PSHUFD:
28590 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
/// Returns true iff the shuffle node \p N can be replaced with ADDSUB
/// operation. If true is returned then the operands of ADDSUB operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
///
/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
/// so it is easier to generically match. We also insert dummy vector shuffle
/// nodes for the operands which explicitly discard the lanes which are unused
/// by this operation to try to flow through the rest of the combiner
/// the fact that they're unused.
static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
                     SDValue &Opnd0, SDValue &Opnd1) {

  EVT VT = N->getValueType(0);
  // ADDSUB instructions only exist for these FP vector types at each feature
  // level (SSE3: 128-bit, AVX: 256-bit, AVX512: 512-bit).
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))

  // We only handle target-independent shuffles.
  // FIXME: It would be easy and harmless to use the target shuffle mask
  // extraction tool to support more.
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)

  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
  // Take a mutable copy of the mask so it can be commuted below.
  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  // We require the first shuffle operand to be the FSUB node, and the second to
  // be the FADD node.
  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
    // Operands arrived in the opposite order; commute the mask so the
    // equivalence checks below see the canonical FSUB/FADD arrangement.
    ShuffleVectorSDNode::commuteMask(Mask);
  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)

  // If there are other uses of these operations we can't fold them.
  if (!V1->hasOneUse() || !V2->hasOneUse())

  // Ensure that both operations have the same operands. Note that we can
  // commute the FADD operands.
  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))

  // We're looking for blends between FADD and FSUB nodes. We insist on these
  // nodes being lined up in a specific expected pattern: even lanes taken from
  // the FSUB result, odd lanes from the FADD result (one mask per vector
  // width: v2, v4, v8 and v16 elements).
  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
                                           8, 25, 10, 27, 12, 29, 14, 31})))
/// \brief Try to combine a shuffle into a target-specific add-sub or
/// mul-add-sub node.
///
/// On success returns the new X86ISD::FMADDSUB or X86ISD::ADDSUB node;
/// otherwise returns SDValue().
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
                                                const X86Subtarget &Subtarget,
                                                SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  // Match the blend-of-FADD/FSUB idiom; on failure there is nothing to do.
  if (!isAddSub(N, Subtarget, Opnd0, Opnd1))

  EVT VT = N->getValueType(0);

  // Try to generate X86ISD::FMADDSUB node here.
  // If the add/sub operands themselves come from a multiply, prefer the fused
  // multiply-add-sub form.
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  if (VT.is512BitVector())

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
// We are looking for a shuffle where both sources are concatenated with undef
// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
// if we can express this as a single-source shuffle, that's preferable.
static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  // VPERMD/VPERMQ (the payoff for this fold) require AVX2.
  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))

  EVT VT = N->getValueType(0);

  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
  if (!VT.is128BitVector() && !VT.is256BitVector())

  if (VT.getVectorElementType() != MVT::i32 &&
      VT.getVectorElementType() != MVT::i64 &&
      VT.getVectorElementType() != MVT::f32 &&
      VT.getVectorElementType() != MVT::f64)

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Check that both sources are concats with undef.
  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
      !N1.getOperand(1).isUndef())

  // Construct the new shuffle mask. Elements from the first source retain their
  // index, but elements from the second source no longer need to skip an undef.
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  // Concatenate the two defined halves and reapply the adjusted mask as a
  // single-source shuffle.
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
/// Combine shuffle nodes — both generic ISD::VECTOR_SHUFFLE and X86 target
/// shuffles — into simpler or cheaper equivalents.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT))
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    // Only handle a bitcast from a vector with exactly half as many (i.e.
    // twice as wide) elements, where the binop is legal in the shuffle type.
    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();

      // The shuffle must pick the even-indexed elements for the low half of
      // the result and leave the upper half undef.
      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

        // Rebuild the binop on bitcast operands and reapply the shuffle.
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
      Elts.push_back(Elt);

  // Only attempt the wide-load fold when every element was resolved.
  if (Elts.size() == VT.getVectorNumElements())
    if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))

  if (isTargetShuffle(N->getOpcode())) {
    // First try target-specific simplifications of this single shuffle node.
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
      return SDValue(); // This routine will use CombineTo to replace N.
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                                TargetLowering::DAGCombinerInfo &DCI) {
  // Wait until after legalize-ops so we don't fight generic combines.
  if (DCI.isBeforeLegalizeOps())

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  // Only a constant extract index can be traced through the shuffle mask.
  if (!isa<ConstantSDNode>(EltNo))

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  // The bitcast must not change the element count, or mask indices would no
  // longer line up with extract indices.
  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())

  if (!isTargetShuffle(InVec.getOpcode()))

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

  // A zero mask element means the extracted element is known zero; fold to a
  // constant of the appropriate (integer or FP) type.
  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  // Indices below NumElems select from the first shuffle operand.
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);

  if (!ISD::isNormalLoad(LdNode.getNode()))

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  // Volatile loads must not be duplicated or narrowed.
  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job

  // Create shuffle node taking into account the case that its a unary shuffle
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
/// Combine ISD::BITCAST nodes: fold MMX-related bitcast idioms to dedicated
/// X86 move nodes, and turn bitcast integer logic ops on FP values into
/// native FP logic ops.
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.

  // Detect bitcasts between i32 to x86mmx low word.
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType() == MVT::i32)
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);

  // Detect bitcasts between element or subvector extraction to x86mmx.
  if (VT == MVT::x86mmx &&
      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
      isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    // Extraction from index 0 of a 128-bit source maps to MOVDQ2Q.
    if (N00.getValueType().is128BitVector())
      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
                         DAG.getBitcast(MVT::v2i64, N00));

  // Detect bitcasts from FP_TO_SINT to x86mmx.
  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
      N0.getOpcode() == ISD::FP_TO_SINT) {
    // Widen v2i32 to v4i32 (upper half undef) so it can pass through MOVDQ2Q.
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                              DAG.getUNDEF(MVT::v2i32));
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                       DAG.getBitcast(MVT::v2i64, Res));

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  switch (N0.getOpcode()) {
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR: FPOpcode = X86ISD::FOR; break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  default: return SDValue();

  // The FP logic nodes require SSE1 for f32 and SSE2 for f64.
  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64)))

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);

  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
  // The pattern must end in an extract from index 0.
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
      !isNullConstant(Extract->getOperand(1)))

      // A pyramid over N elements requires log2(N) binop/shuffle stages.
      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());

  SDValue Op = Extract->getOperand(0);
  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != BinOp)

    // The shuffle may sit on either side of the binop; try operand 0 first,
    // then fall back to operand 1.
    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
      Op = Op.getOperand(1);
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || (Shuffle->getOperand(0) != Op))

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
// Given a select, detect the following pattern:
// 1: %2 = zext <N x i8> %0 to <N x i32>
// 2: %3 = zext <N x i8> %1 to <N x i32>
// 3: %4 = sub nsw <N x i32> %2, %3
// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
// On a successful match the zext operands of the inner sub are written to
// \p Op0 and \p Op1.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
  // Check the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  // Both comparison directions are accepted; SETLT is normalized below.
  if (CC != ISD::SETGT && CC != ISD::SETLT)

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))

  // The first operand of SetCC is the first operand of the select, which is the
  // difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)

  // In SetLT case, The second operand of the comparison can be either 1 or 0.
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))

  // In SetGT case, The second operand of the comparison can be either -1 or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {

  // Find the appropriate width for the PSADBW.
  // PSADBW operates on at least a full 128-bit register.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD; the result has one i64 lane per 64 input bits.
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                const X86Subtarget &Subtarget) {
  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  // Only scalar integer results up to 64 bits are handled.
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8)

  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
    SDValue Match = matchBinOpReduction(Extract, Op);

    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
    // which we can't support here for now.
    if (Match.getScalarValueSizeInBits() != BitWidth)

    // We require AVX2 for PMOVMSKB for v16i16/v32i8;
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 &&
           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))

    // Don't bother performing this for 2-element vectors.
    if (Match.getValueType().getVectorNumElements() <= 2)

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)

    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    if (64 == BitWidth || 32 == BitWidth)
      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                MatchSizeInBits / BitWidth);
      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    ISD::CondCode CondCode;
    if (Op == ISD::OR) {
      // any_of -> MOVMSK != 0
      CompareBits = APInt::getNullValue(32);
      CondCode = ISD::CondCode::SETNE;
      // all_of -> MOVMSK == ((1 << NumElts) - 1)
      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
      CondCode = ISD::CondCode::SETEQ;

    // Perform the select as i32/i64 and then truncate to avoid partial register
    // stalls.
    unsigned ResWidth = std::max(BitWidth, 32u);
    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);

    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
    SDValue Res = DAG.getBitcast(MaskVT, Match);
    // MOVMSK gathers the per-element sign bits into a single i32 mask, which
    // is then compared against CompareBits to produce the all-ones/zero result.
    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
                          Ones, Zero, CondCode);
    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
// Try to replace a horizontal reduction of absolute differences (rooted at an
// EXTRACT_VECTOR_ELT of a shuffle+add pyramid) with the X86 PSADBW
// (sum-of-absolute-differences) instruction plus a small shuffle+add tree.
// NOTE(review): this excerpt elides some statement lines (the embedded
// original line numbers show gaps) — presumably the missing lines are the
// "return SDValue();" bail-outs after each guard and the RegSize = 512/256
// assignments under the hasBWI()/hasAVX2() checks; confirm against the full
// source.
29252 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29253 const X86Subtarget &Subtarget) {
29254 // PSADBW is only supported on SSE2 and up.
29255 if (!Subtarget.hasSSE2())
29258 // Verify the type we're extracting from is any integer type above i16.
29259 EVT VT = Extract->getOperand(0).getValueType();
29260 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
// RegSize tracks the widest vector register available: 128 for SSE2,
// wider with AVX2 / AVX-512 BWI (assignments elided in this excerpt).
29263 unsigned RegSize = 128;
29264 if (Subtarget.hasBWI())
29266 else if (Subtarget.hasAVX2())
29269 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29270 // TODO: We should be able to handle larger vectors by splitting them before
29271 // feeding them into several SADs, and then reducing over those.
29272 if (RegSize / VT.getVectorNumElements() < 8)
29275 // Match shuffle + add pyramid.
29276 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29278 // The operand is expected to be zero extended from i8
29279 // (verified in detectZextAbsDiff).
29280 // In order to convert to i64 and above, additional any/zero/sign
29281 // extend is expected.
29282 // The zero extend from 32 bit has no mathematical effect on the result.
29283 // Also the sign extend is basically zero extend
29284 // (extends the sign bit which is zero).
29285 // So it is correct to skip the sign/zero extend instruction.
29286 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29287 Root.getOpcode() == ISD::ZERO_EXTEND ||
29288 Root.getOpcode() == ISD::ANY_EXTEND))
29289 Root = Root.getOperand(0);
29291 // If there was a match, we want Root to be a select that is the root of an
29292 // abs-diff pattern.
29293 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29296 // Check whether we have an abs-diff pattern feeding into the select.
29297 SDValue Zext0, Zext1;
29298 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29301 // Create the SAD instruction.
29303 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29305 // If the original vector was wider than 8 elements, sum over the results
29306 // in the SAD vector.
29307 unsigned Stages = Log2_32(VT.getVectorNumElements());
29308 MVT SadVT = SAD.getSimpleValueType();
// Log-tree reduction: each iteration shuffles the upper half of the live
// lanes down and adds, halving the number of partial sums.
29310 unsigned SadElems = SadVT.getVectorNumElements();
29312 for(unsigned i = Stages - 3; i > 0; --i) {
29313 SmallVector<int, 16> Mask(SadElems, -1);
29314 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29315 Mask[j] = MaskEnd + j;
29318 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29319 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
// Bitcast the SAD result so element 0 has the extract's type, then extract.
29323 MVT Type = Extract->getSimpleValueType(0);
29324 unsigned TypeSizeInBits = Type.getSizeInBits();
29325 // Return the lowest TypeSizeInBits bits.
29326 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29327 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29328 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29329 Extract->getOperand(1));
29332 // Attempt to peek through a target shuffle and extract the scalar from the
// source vector that the shuffle actually reads, avoiding the shuffle
// entirely. Returns an empty SDValue when the combine does not apply.
// NOTE(review): elided lines in this excerpt (gaps in the embedded original
// line numbers) are presumably the "return SDValue();" bail-outs and closing
// braces — verify against the full source.
29334 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29335 TargetLowering::DAGCombinerInfo &DCI,
29336 const X86Subtarget &Subtarget) {
29337 if (DCI.isBeforeLegalizeOps())
29340 SDValue Src = N->getOperand(0);
29341 SDValue Idx = N->getOperand(1);
29343 EVT VT = N->getValueType(0);
29344 EVT SrcVT = Src.getValueType();
29345 EVT SrcSVT = SrcVT.getVectorElementType();
29346 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29348 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29349 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29352 // Resolve the target shuffle inputs and mask.
29353 SmallVector<int, 16> Mask;
29354 SmallVector<SDValue, 2> Ops;
29355 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
29358 // Attempt to narrow/widen the shuffle mask to the correct size.
29359 if (Mask.size() != NumSrcElts) {
29360 if ((NumSrcElts % Mask.size()) == 0) {
// Mask is coarser than the source vector: scale it up element-wise.
29361 SmallVector<int, 16> ScaledMask;
29362 int Scale = NumSrcElts / Mask.size();
29363 scaleShuffleMask(Scale, Mask, ScaledMask);
29364 Mask = std::move(ScaledMask);
29365 } else if ((Mask.size() % NumSrcElts) == 0) {
// Mask is finer than the source vector: repeatedly widen while possible.
29366 SmallVector<int, 16> WidenedMask;
29367 while (Mask.size() > NumSrcElts &&
29368 canWidenShuffleElements(Mask, WidenedMask))
29369 Mask = std::move(WidenedMask);
29370 // TODO - investigate support for wider shuffle masks with known upper
29371 // undef/zero elements for implicit zero-extension.
29375 // Check if narrowing/widening failed.
29376 if (Mask.size() != NumSrcElts)
// Map the extracted lane through the shuffle mask to the source lane.
29379 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29382 // If the shuffle source element is undef/zero then we can just accept it.
29383 if (SrcIdx == SM_SentinelUndef)
29384 return DAG.getUNDEF(VT);
29386 if (SrcIdx == SM_SentinelZero)
29387 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29388 : DAG.getConstant(0, dl, VT);
// Select which shuffle input operand the lane comes from, and reduce the
// index to be relative to that operand.
29390 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29391 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29392 SrcIdx = SrcIdx % Mask.size();
29394 // We can only extract other elements from 128-bit vectors and in certain
29395 // circumstances, depending on SSE-level.
29396 // TODO: Investigate using extract_subvector for larger vectors.
29397 // TODO: Investigate float/double extraction if it will be just stored.
29398 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29399 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29400 assert(SrcSVT == VT && "Unexpected extraction type");
29401 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29402 DAG.getIntPtrConstant(SrcIdx, dl));
// i16/i8 element extraction maps to PEXTRW/PEXTRB, which produce an i32
// whose upper bits are known zero (hence the AssertZext).
29405 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29406 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29407 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29408 "Unexpected extraction type");
29409 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29410 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29411 DAG.getIntPtrConstant(SrcIdx, dl));
29412 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29413 DAG.getValueType(SrcSVT));
29414 return DAG.getZExtOrTrunc(Assert, dl, VT);
29420 /// Detect vector gather/scatter index generation and convert it from being a
29421 /// bunch of shuffles and extracts into a somewhat faster sequence.
29422 /// For i686, the best sequence is apparently storing the value and loading
29423 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
// NOTE(review): elided lines in this excerpt (gaps in the embedded original
// line numbers) are presumably early returns ("return NewOp;",
// "return SDValue();"), "continue;" statements in the use-scan loop, and the
// declarations of Vals[4]/Shuffle — verify against the full source.
29424 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29425 TargetLowering::DAGCombinerInfo &DCI,
29426 const X86Subtarget &Subtarget) {
29427 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29430 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29433 SDValue InputVector = N->getOperand(0);
29434 SDValue EltIdx = N->getOperand(1);
29436 EVT SrcVT = InputVector.getValueType();
29437 EVT VT = N->getValueType(0);
29438 SDLoc dl(InputVector);
29440 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29441 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29442 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29443 SDValue MMXSrc = InputVector.getOperand(0);
29445 // The bitcast source is a direct mmx result.
29446 if (MMXSrc.getValueType() == MVT::x86mmx)
29447 return DAG.getBitcast(VT, InputVector);
29450 // Detect mmx to i32 conversion through a v2i32 elt extract.
29451 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29452 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29453 SDValue MMXSrc = InputVector.getOperand(0);
29455 // The bitcast source is a direct mmx result.
29456 if (MMXSrc.getValueType() == MVT::x86mmx)
29457 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
// Fold extraction of a single bit from a constant bitcast to i1 vector:
// pick bit ExtractedElt out of the constant integer directly.
29460 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29461 isa<ConstantSDNode>(EltIdx) &&
29462 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29463 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29464 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29465 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29466 return DAG.getConstant(Res, dl, MVT::i1);
29469 // Check whether this extract is the root of a sum of absolute differences
29470 // pattern. This has to be done here because we really want it to happen
29471 // pre-legalization.
29472 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29475 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29476 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29479 // Only operate on vectors of 4 elements, where the alternative shuffling
29480 // gets to be more expensive.
29481 if (SrcVT != MVT::v4i32)
29484 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29485 // single use which is a sign-extend or zero-extend, and all elements are
29487 SmallVector<SDNode *, 4> Uses;
29488 unsigned ExtractedElements = 0;
29489 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29490 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29491 if (UI.getUse().getResNo() != InputVector.getResNo())
29494 SDNode *Extract = *UI;
29495 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29498 if (Extract->getValueType(0) != MVT::i32)
29500 if (!Extract->hasOneUse())
29502 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29503 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29505 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29508 // Record which element was extracted.
29509 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29510 Uses.push_back(Extract);
29513 // If not all the elements were used, this may not be worthwhile.
// 15 == 0b1111: all four lanes of the v4i32 were extracted.
29514 if (ExtractedElements != 15)
29517 // Ok, we've now decided to do the transformation.
29518 // If 64-bit shifts are legal, use the extract-shift sequence,
29519 // otherwise bounce the vector off the cache.
29520 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29523 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
// View the v4i32 as two i64 halves; each i64 yields two i32 lanes via
// truncate (low half) and arithmetic-shift-right 32 + truncate (high half).
29524 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29525 auto &DL = DAG.getDataLayout();
29526 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29527 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29528 DAG.getConstant(0, dl, VecIdxTy));
29529 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29530 DAG.getConstant(1, dl, VecIdxTy));
29532 SDValue ShAmt = DAG.getConstant(
29533 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29534 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29535 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29536 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29537 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29538 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29539 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29541 // Store the value to a temporary stack slot.
29542 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29543 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29544 MachinePointerInfo());
29546 EVT ElementType = SrcVT.getVectorElementType();
29547 unsigned EltSize = ElementType.getSizeInBits() / 8;
29549 // Replace each use (extract) with a load of the appropriate element.
29550 for (unsigned i = 0; i < 4; ++i) {
29551 uint64_t Offset = EltSize * i;
29552 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29553 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29555 SDValue ScalarAddr =
29556 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29558 // Load the scalar.
29560 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29564 // Replace the extracts
29565 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29566 UE = Uses.end(); UI != UE; ++UI) {
29567 SDNode *Extract = *UI;
29569 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29570 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29573 // The replacement was made in place; don't return anything.
29577 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29578 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29579 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29580 // combineBasicSADPattern.
// Thin wrapper: applies only the shuffle-based extract combine, skipping the
// other EXTRACT_VECTOR_ELT combines performed by combineExtractVectorElt.
29581 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29582 TargetLowering::DAGCombinerInfo &DCI,
29583 const X86Subtarget &Subtarget) {
29584 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29587 /// If a vector select has an operand that is -1 or 0, try to simplify the
29588 /// select to a bitwise logic operation.
// NOTE(review): elided lines in this excerpt (gaps in the embedded original
// line numbers) are presumably the "return SDValue();" bail-outs and closing
// braces — verify against the full source.
29590 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29591 TargetLowering::DAGCombinerInfo &DCI,
29592 const X86Subtarget &Subtarget) {
29593 SDValue Cond = N->getOperand(0);
29594 SDValue LHS = N->getOperand(1);
29595 SDValue RHS = N->getOperand(2);
29596 EVT VT = LHS.getValueType();
29597 EVT CondVT = Cond.getValueType();
29599 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29601 if (N->getOpcode() != ISD::VSELECT)
29604 assert(CondVT.isVector() && "Vector select expects a vector selector!");
29606 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29607 // Check if the first operand is all zeros and Cond type is vXi1.
29608 // This situation only applies to avx512.
29609 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
29610 CondVT.getVectorElementType() == MVT::i1) {
29611 // Invert the cond to not(cond) : xor(op,allones)=not(op)
29612 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29613 DAG.getAllOnesConstant(DL, CondVT));
29614 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
29615 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
29618 // To use the condition operand as a bitwise mask, it must have elements that
29619 // are the same size as the select elements. Ie, the condition operand must
29620 // have already been promoted from the IR select condition type <N x i1>.
29621 // Don't check if the types themselves are equal because that excludes
29622 // vector floating-point selects.
29623 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
29626 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
29627 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
29629 // Try to invert the condition if true value is not all 1s and false value is
// not all 0s: inverting the setcc and swapping the arms may expose the
// all-ones/all-zeros forms handled below.
29631 if (!TValIsAllOnes && !FValIsAllZeros &&
29632 // Check if the selector will be produced by CMPP*/PCMP*.
29633 Cond.getOpcode() == ISD::SETCC &&
29634 // Check if SETCC has already been promoted.
29635 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
29637 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29638 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
29640 if (TValIsAllZeros || FValIsAllOnes) {
29641 SDValue CC = Cond.getOperand(2);
29642 ISD::CondCode NewCC =
29643 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
29644 Cond.getOperand(0).getValueType().isInteger());
29645 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
29647 std::swap(LHS, RHS);
29648 TValIsAllOnes = FValIsAllOnes;
29649 FValIsAllZeros = TValIsAllZeros;
29653 // vselect Cond, 111..., 000... -> Cond
29654 if (TValIsAllOnes && FValIsAllZeros)
29655 return DAG.getBitcast(VT, Cond);
29657 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
29660 // vselect Cond, 111..., X -> or Cond, X
29661 if (TValIsAllOnes) {
29662 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
29663 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
29664 return DAG.getBitcast(VT, Or);
29667 // vselect Cond, X, 000... -> and Cond, X
29668 if (FValIsAllZeros) {
29669 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
29670 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
29671 return DAG.getBitcast(VT, And);
// Fold (select Cond, C1, C2) with two constant arms into shift- or LEA-style
// arithmetic on the zero-extended condition, avoiding a branch/cmov.
// NOTE(review): elided lines in this excerpt (gaps in the embedded original
// line numbers) are presumably "return SDValue();" bail-outs, closing braces,
// and final "return Cond;" returns — verify against the full source.
29677 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
29678 SDValue Cond = N->getOperand(0);
29679 SDValue LHS = N->getOperand(1);
29680 SDValue RHS = N->getOperand(2);
29683 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
29684 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
29685 if (!TrueC || !FalseC)
29688 // Don't do this for crazy integer types.
29689 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
29692 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
29693 // so that TrueC (the true value) is larger than FalseC.
29694 bool NeedsCondInvert = false;
29695 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
29696 // Efficiently invertible.
29697 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
29698 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
29699 isa<ConstantSDNode>(Cond.getOperand(1))))) {
29700 NeedsCondInvert = true;
29701 std::swap(TrueC, FalseC);
29704 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
29705 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29706 if (NeedsCondInvert) // Invert the condition if needed.
29707 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29708 DAG.getConstant(1, DL, Cond.getValueType()));
29710 // Zero extend the condition if needed.
29711 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
29713 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29714 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
29715 DAG.getConstant(ShAmt, DL, MVT::i8));
29718 // Optimize cases that will turn into an LEA instruction. This requires
29719 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29720 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29721 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
29722 if (N->getValueType(0) == MVT::i32)
29723 Diff = (unsigned)Diff; // Truncate the difference for i32 selects.
29725 bool isFastMultiplier = false;
// Only differences that LEA can scale (base + cond*{1,2,4,8} [+ cond])
// are worth the transform; see the per-case encodings below.
29727 switch ((unsigned char)Diff) {
29730 case 1: // result = add base, cond
29731 case 2: // result = lea base( , cond*2)
29732 case 3: // result = lea base(cond, cond*2)
29733 case 4: // result = lea base( , cond*4)
29734 case 5: // result = lea base(cond, cond*4)
29735 case 8: // result = lea base( , cond*8)
29736 case 9: // result = lea base(cond, cond*8)
29737 isFastMultiplier = true;
29742 if (isFastMultiplier) {
29743 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
29744 if (NeedsCondInvert) // Invert the condition if needed.
29745 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29746 DAG.getConstant(1, DL, Cond.getValueType()));
29748 // Zero extend the condition if needed.
29749 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
29750 // Scale the condition by the difference.
29752 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29753 DAG.getConstant(Diff, DL, Cond.getValueType()));
29755 // Add the base if non-zero.
29756 if (FalseC->getAPIntValue() != 0)
29757 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29758 SDValue(FalseC, 0));
29766 // If this is a bitcasted op that can be represented as another type, push
29767 // the bitcast to the inputs. This allows more opportunities for pattern
29768 // matching masked instructions. This is called when we know that the operation
29769 // is used as one of the inputs of a vselect.
// Returns true (via DCI.CombineTo) when a combine was made, false otherwise.
// NOTE(review): elided lines in this excerpt (gaps in the embedded original
// line numbers) are presumably "return false;"/"return true;" statements,
// "break;"s, and the switch's default case — verify against the full source.
29770 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
29771 TargetLowering::DAGCombinerInfo &DCI) {
29772 // Make sure we have a bitcast.
29773 if (OrigOp.getOpcode() != ISD::BITCAST)
29776 SDValue Op = OrigOp.getOperand(0);
29778 // If the operation is used by anything other than the bitcast, we shouldn't
29779 // do this combine as that would replicate the operation.
29780 if (!Op.hasOneUse())
29783 MVT VT = OrigOp.getSimpleValueType();
29784 MVT EltVT = VT.getVectorElementType();
29785 SDLoc DL(Op.getNode());
// Helper: bitcast both shuffle operands to the target VT and rebuild the
// node with the (possibly adjusted) immediate operand Op2.
29787 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
29789 Op0 = DAG.getBitcast(VT, Op0);
29790 DCI.AddToWorklist(Op0.getNode());
29791 Op1 = DAG.getBitcast(VT, Op1);
29792 DCI.AddToWorklist(Op1.getNode());
29793 DCI.CombineTo(OrigOp.getNode(),
29794 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29798 unsigned Opcode = Op.getOpcode();
29800 case X86ISD::PALIGNR:
29801 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29802 if (!VT.is128BitVector())
29804 Opcode = X86ISD::VALIGN;
// FALLTHROUGH into the VALIGN handling below (presumably; the elided lines
// would show whether there is an explicit fallthrough — verify).
29806 case X86ISD::VALIGN: {
29807 if (EltVT != MVT::i32 && EltVT != MVT::i64)
// Rescale the rotation immediate from the old element size to the new one;
// bail out if the byte shift is not a whole number of new elements.
29809 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29810 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29811 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29812 unsigned EltSize = EltVT.getSizeInBits();
29813 // Make sure we can represent the same shift with the new VT.
29814 if ((ShiftAmt % EltSize) != 0)
29816 Imm = ShiftAmt / EltSize;
29817 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29818 DAG.getConstant(Imm, DL, MVT::i8));
29820 case X86ISD::SHUF128: {
29821 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29823 // Only change element size, not type.
29824 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29826 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29829 case ISD::INSERT_SUBVECTOR: {
29830 unsigned EltSize = EltVT.getSizeInBits();
29831 if (EltSize != 32 && EltSize != 64)
29833 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29834 // Only change element size, not type.
29835 if (EltVT.isInteger() != OpEltVT.isInteger())
// Rescale the insertion index to the new element granularity.
29837 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29838 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29839 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29840 DCI.AddToWorklist(Op0.getNode());
29841 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29842 SDValue Op1 = Op.getOperand(1);
29843 MVT Op1VT = MVT::getVectorVT(EltVT,
29844 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29845 Op1 = DAG.getBitcast(Op1VT, Op1);
29846 DCI.AddToWorklist(Op1.getNode());
29847 DCI.CombineTo(OrigOp.getNode(),
29848 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29849 DAG.getIntPtrConstant(Imm, DL)));
29852 case ISD::EXTRACT_SUBVECTOR: {
29853 unsigned EltSize = EltVT.getSizeInBits();
29854 if (EltSize != 32 && EltSize != 64)
29856 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29857 // Only change element size, not type.
29858 if (EltVT.isInteger() != OpEltVT.isInteger())
// Rescale the extraction index to the new element granularity.
29860 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29861 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29862 // Op0 needs to be bitcasted to a larger vector with the same element type.
29863 SDValue Op0 = Op.getOperand(0);
29864 MVT Op0VT = MVT::getVectorVT(EltVT,
29865 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29866 Op0 = DAG.getBitcast(Op0VT, Op0);
29867 DCI.AddToWorklist(Op0.getNode());
29868 DCI.CombineTo(OrigOp.getNode(),
29869 DAG.getNode(Opcode, DL, VT, Op0,
29870 DAG.getIntPtrConstant(Imm, DL)));
29873 case X86ISD::SUBV_BROADCAST: {
29874 unsigned EltSize = EltVT.getSizeInBits();
29875 if (EltSize != 32 && EltSize != 64)
29877 // Only change element size, not type.
29878 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29880 SDValue Op0 = Op.getOperand(0);
29881 MVT Op0VT = MVT::getVectorVT(EltVT,
29882 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29883 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
29884 DCI.AddToWorklist(Op0.getNode());
29885 DCI.CombineTo(OrigOp.getNode(),
29886 DAG.getNode(Opcode, DL, VT, Op0));
29894 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29895 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29896 TargetLowering::DAGCombinerInfo &DCI,
29897 const X86Subtarget &Subtarget) {
29899 SDValue Cond = N->getOperand(0);
29900 // Get the LHS/RHS of the select.
29901 SDValue LHS = N->getOperand(1);
29902 SDValue RHS = N->getOperand(2);
29903 EVT VT = LHS.getValueType();
29904 EVT CondVT = Cond.getValueType();
29905 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29907 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29908 // instructions match the semantics of the common C idiom x<y?x:y but not
29909 // x<=y?x:y, because of how they handle negative zero (which can be
29910 // ignored in unsafe-math mode).
29911 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
29912 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29913 VT != MVT::f80 && VT != MVT::f128 &&
29914 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29915 (Subtarget.hasSSE2() ||
29916 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29917 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29919 unsigned Opcode = 0;
29920 // Check for x CC y ? x : y.
29921 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29922 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29926 // Converting this to a min would handle NaNs incorrectly, and swapping
29927 // the operands would cause it to handle comparisons between positive
29928 // and negative zero incorrectly.
29929 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29930 if (!DAG.getTarget().Options.UnsafeFPMath &&
29931 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29933 std::swap(LHS, RHS);
29935 Opcode = X86ISD::FMIN;
29938 // Converting this to a min would handle comparisons between positive
29939 // and negative zero incorrectly.
29940 if (!DAG.getTarget().Options.UnsafeFPMath &&
29941 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29943 Opcode = X86ISD::FMIN;
29946 // Converting this to a min would handle both negative zeros and NaNs
29947 // incorrectly, but we can swap the operands to fix both.
29948 std::swap(LHS, RHS);
29952 Opcode = X86ISD::FMIN;
29956 // Converting this to a max would handle comparisons between positive
29957 // and negative zero incorrectly.
29958 if (!DAG.getTarget().Options.UnsafeFPMath &&
29959 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29961 Opcode = X86ISD::FMAX;
29964 // Converting this to a max would handle NaNs incorrectly, and swapping
29965 // the operands would cause it to handle comparisons between positive
29966 // and negative zero incorrectly.
29967 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29968 if (!DAG.getTarget().Options.UnsafeFPMath &&
29969 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29971 std::swap(LHS, RHS);
29973 Opcode = X86ISD::FMAX;
29976 // Converting this to a max would handle both negative zeros and NaNs
29977 // incorrectly, but we can swap the operands to fix both.
29978 std::swap(LHS, RHS);
29982 Opcode = X86ISD::FMAX;
29985 // Check for x CC y ? y : x -- a min/max with reversed arms.
29986 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
29987 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
29991 // Converting this to a min would handle comparisons between positive
29992 // and negative zero incorrectly, and swapping the operands would
29993 // cause it to handle NaNs incorrectly.
29994 if (!DAG.getTarget().Options.UnsafeFPMath &&
29995 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
29996 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29998 std::swap(LHS, RHS);
30000 Opcode = X86ISD::FMIN;
30003 // Converting this to a min would handle NaNs incorrectly.
30004 if (!DAG.getTarget().Options.UnsafeFPMath &&
30005 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30007 Opcode = X86ISD::FMIN;
30010 // Converting this to a min would handle both negative zeros and NaNs
30011 // incorrectly, but we can swap the operands to fix both.
30012 std::swap(LHS, RHS);
30016 Opcode = X86ISD::FMIN;
30020 // Converting this to a max would handle NaNs incorrectly.
30021 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30023 Opcode = X86ISD::FMAX;
30026 // Converting this to a max would handle comparisons between positive
30027 // and negative zero incorrectly, and swapping the operands would
30028 // cause it to handle NaNs incorrectly.
30029 if (!DAG.getTarget().Options.UnsafeFPMath &&
30030 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30031 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30033 std::swap(LHS, RHS);
30035 Opcode = X86ISD::FMAX;
30038 // Converting this to a max would handle both negative zeros and NaNs
30039 // incorrectly, but we can swap the operands to fix both.
30040 std::swap(LHS, RHS);
30044 Opcode = X86ISD::FMAX;
30050 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30053 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30054 // lowering on KNL. In this case we convert it to
30055 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30056 // The same situation for all 128 and 256-bit vectors of i8 and i16.
30057 // Since SKX these selects have a proper lowering.
30058 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30059 CondVT.getVectorElementType() == MVT::i1 &&
30060 (VT.is128BitVector() || VT.is256BitVector()) &&
30061 (VT.getVectorElementType() == MVT::i8 ||
30062 VT.getVectorElementType() == MVT::i16) &&
30063 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30064 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30065 DCI.AddToWorklist(Cond.getNode());
30066 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30069 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30072 // Canonicalize max and min:
30073 // (x > y) ? x : y -> (x >= y) ? x : y
30074 // (x < y) ? x : y -> (x <= y) ? x : y
30075 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30076 // the need for an extra compare
30077 // against zero. e.g.
30078 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30080 // testl %edi, %edi
30082 // cmovgl %edi, %eax
30086 // cmovsl %eax, %edi
30087 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30088 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30089 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30090 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30095 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30096 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30097 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30098 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
30103 // Early exit check
30104 if (!TLI.isTypeLegal(VT))
30107 // Match VSELECTs into subs with unsigned saturation.
30108 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30109 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30110 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30111 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30112 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30114 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30115 // left side invert the predicate to simplify logic below.
30117 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30119 CC = ISD::getSetCCInverse(CC, true);
30120 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30124 if (Other.getNode() && Other->getNumOperands() == 2 &&
30125 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30126 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30127 SDValue CondRHS = Cond->getOperand(1);
30129 // Look for a general sub with unsigned saturation first.
30130 // x >= y ? x-y : 0 --> subus x, y
30131 // x > y ? x-y : 0 --> subus x, y
30132 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30133 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30134 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30136 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30137 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30138 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30139 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30140 // If the RHS is a constant we have to reverse the const
30141 // canonicalization.
30142 // x > C-1 ? x+-C : 0 --> subus x, C
30143 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30144 CondRHSConst->getAPIntValue() ==
30145 (-OpRHSConst->getAPIntValue() - 1))
30146 return DAG.getNode(
30147 X86ISD::SUBUS, DL, VT, OpLHS,
30148 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30150 // Another special case: If C was a sign bit, the sub has been
30151 // canonicalized into a xor.
30152 // FIXME: Would it be better to use computeKnownBits to determine
30153 // whether it's safe to decanonicalize the xor?
30154 // x s< 0 ? x^C : 0 --> subus x, C
30155 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30156 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30157 OpRHSConst->getAPIntValue().isSignMask())
30158 // Note that we have to rebuild the RHS constant here to ensure we
30159 // don't rely on particular values of undef lanes.
30160 return DAG.getNode(
30161 X86ISD::SUBUS, DL, VT, OpLHS,
30162 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30167 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30170 // If this is a *dynamic* select (non-constant condition) and we can match
30171 // this node with one of the variable blend instructions, restructure the
30172 // condition so that blends can use the high (sign) bit of each element and
30173 // use SimplifyDemandedBits to simplify the condition operand.
30174 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30175 !DCI.isBeforeLegalize() &&
30176 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30177 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30179 // Don't optimize vector selects that map to mask-registers.
30183 // We can only handle the cases where VSELECT is directly legal on the
30184 // subtarget. We custom lower VSELECT nodes with constant conditions and
30185 // this makes it hard to see whether a dynamic VSELECT will correctly
30186 // lower, so we both check the operation's status and explicitly handle the
30187 // cases where a *dynamic* blend will fail even though a constant-condition
30188 // blend could be custom lowered.
30189 // FIXME: We should find a better way to handle this class of problems.
30190 // Potentially, we should combine constant-condition vselect nodes
30191 // pre-legalization into shuffles and not mark as many types as custom
30193 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30195 // FIXME: We don't support i16-element blends currently. We could and
30196 // should support them by making *all* the bits in the condition be set
30197 // rather than just the high bit and using an i8-element blend.
30198 if (VT.getVectorElementType() == MVT::i16)
30200 // Dynamic blending was only available from SSE4.1 onward.
30201 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30203 // Byte blends are only available in AVX2
30204 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30207 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30208 APInt DemandedMask(APInt::getSignMask(BitWidth));
30209 APInt KnownZero, KnownOne;
30210 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
30211 DCI.isBeforeLegalizeOps());
30212 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30213 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
30215 // If we changed the computation somewhere in the DAG, this change will
30216 // affect all users of Cond. Make sure it is fine and update all the nodes
30217 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30218 // perform wrong optimizations as we messed with the actual expectation
30219 // for the vector boolean values.
30220 if (Cond != TLO.Old) {
30221 // Check all uses of the condition operand to check whether it will be
30222 // consumed by non-BLEND instructions. Those may require that all bits
30223 // are set properly.
30224 for (SDNode *U : Cond->uses()) {
30225 // TODO: Add other opcodes eventually lowered into BLEND.
30226 if (U->getOpcode() != ISD::VSELECT)
30230 // Update all users of the condition before committing the change, so
30231 // that the VSELECT optimizations that expect the correct vector boolean
30232 // value will not be triggered.
30233 for (SDNode *U : Cond->uses()) {
30234 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30235 U->getValueType(0), Cond, U->getOperand(1),
30237 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30239 DCI.CommitTargetLoweringOpt(TLO);
30242 // Only Cond (rather than other nodes in the computation chain) was
30243 // changed. Change the condition just for N to keep the opportunity to
30244 // optimize all other users their own way.
30245 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30246 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30251 // Look for vselects with LHS/RHS being bitcasted from an operation that
30252 // can be executed on another type. Push the bitcast to the inputs of
30253 // the operation. This exposes opportunities for using masking instructions.
30254 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30255 CondVT.getVectorElementType() == MVT::i1) {
30256 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30257 return SDValue(N, 0);
30258 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30259 return SDValue(N, 0);
/// Combine brcond/cmov/setcc uses of a compare of an atomic RMW result, e.g.:
30266 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// into:
30268 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30269 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30270 /// Note that this is only legal for some op/cc combinations.
/// \p CC is updated in place to the condition appropriate for the LOCKed form.
30271 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30272 SelectionDAG &DAG) {
30273 // This combine only operates on CMP-like nodes.
30274 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30275 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30278 // Can't replace the cmp if it has more uses than the one we're looking at.
30279 // FIXME: We would like to be able to handle this, but would need to make sure
30280 // all uses were updated.
30281 if (!Cmp.hasOneUse())
30284 // This only applies to variations of the common case:
30285 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30286 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30287 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30288 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30289 // Using the proper condcodes (see below), overflow is checked for.
30291 // FIXME: We can generalize both constraints:
30292 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30294 // if the result is compared.
30296 SDValue CmpLHS = Cmp.getOperand(0);
30297 SDValue CmpRHS = Cmp.getOperand(1);
// The atomic op's value result must feed only this compare so it can be
// replaced by the flag-producing LOCKed form below.
30299 if (!CmpLHS.hasOneUse())
// Only compares against the constant zero are handled.
30302 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30303 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30306 const unsigned Opc = CmpLHS.getOpcode();
30308 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
// Operand 2 of the ATOMIC_LOAD_* node is the RHS amount; it must be constant.
30311 SDValue OpRHS = CmpLHS.getOperand(2);
30312 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30316 APInt Addend = OpRHSC->getAPIntValue();
// NOTE(review): for ATOMIC_LOAD_SUB the addend is presumably negated here so
// the condcode table below applies uniformly — confirm against full source.
30317 if (Opc == ISD::ATOMIC_LOAD_SUB)
// Only these cc/addend pairs are legal: the replacement condition accounts
// for possible signed overflow of the +/-1 update (see table above).
30320 if (CC == X86::COND_S && Addend == 1)
30322 else if (CC == X86::COND_NS && Addend == 1)
30324 else if (CC == X86::COND_G && Addend == -1)
30326 else if (CC == X86::COND_LE && Addend == -1)
// Rebuild the RMW op as a flag-producing LOCKed node; the old value result
// had no other uses, so it can become undef, and the chain is rewired.
30331 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30332 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30333 DAG.getUNDEF(CmpLHS.getValueType()));
30334 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30338 // Check whether a boolean test is testing a boolean value generated by
30339 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code (via the in-out parameter \p CC).
30342 // Simplify the following patterns:
30343 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30344 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30345 // to (Op EFLAGS Cond)
30347 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30348 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30349 // to (Op EFLAGS !Cond)
30351 // where Op could be BRCOND or CMOV.
30353 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30354 // This combine only operates on CMP-like nodes.
30355 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30356 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30359 // Quit if not used as a boolean value.
30360 if (CC != X86::COND_E && CC != X86::COND_NE)
30363 // Check CMP operands. One of them should be 0 or 1 and the other should be
30364 // an SetCC or extended from it.
30365 SDValue Op1 = Cmp.getOperand(0);
30366 SDValue Op2 = Cmp.getOperand(1);
30369 const ConstantSDNode* C = nullptr;
// Testing equality with zero means the SETCC's condition must be inverted.
30370 bool needOppositeCond = (CC == X86::COND_E);
30371 bool checkAgainstTrue = false; // Is it a comparison against 1?
// Identify which CMP operand is the 0/1 constant; the other is the
// (possibly extended) SETCC value.
30373 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30375 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30377 else // Quit if all operands are not constants.
// Comparing against 1 flips the sense relative to comparing against 0.
30380 if (C->getZExtValue() == 1) {
30381 needOppositeCond = !needOppositeCond;
30382 checkAgainstTrue = true;
30383 } else if (C->getZExtValue() != 0)
30384 // Quit if the constant is neither 0 or 1.
30387 bool truncatedToBoolWithAnd = false;
30388 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
// NOTE(review): SetCC is presumably the non-constant CMP operand, initialized
// just above — confirm against the full source.
30389 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30390 SetCC.getOpcode() == ISD::TRUNCATE ||
30391 SetCC.getOpcode() == ISD::AND) {
30392 if (SetCC.getOpcode() == ISD::AND) {
// Require an (and x, 1) mask on one side; remember we saw the bool-truncating
// 'and' so SETCC_CARRY below can be accepted safely.
30394 if (isOneConstant(SetCC.getOperand(0)))
30396 if (isOneConstant(SetCC.getOperand(1)))
30400 SetCC = SetCC.getOperand(OpIdx);
30401 truncatedToBoolWithAnd = true;
30403 SetCC = SetCC.getOperand(0);
30406 switch (SetCC.getOpcode()) {
30407 case X86ISD::SETCC_CARRY:
30408 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30409 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30410 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30411 // truncated to i1 using 'and'.
30412 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30414 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30415 "Invalid use of SETCC_CARRY!")
30417 case X86ISD::SETCC:
30418 // Set the condition code or opposite one if necessary.
30419 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30420 if (needOppositeCond)
30421 CC = X86::GetOppositeBranchCondition(CC);
// Return the EFLAGS value the SETCC was testing.
30422 return SetCC.getOperand(1);
30423 case X86ISD::CMOV: {
30424 // Check whether false/true value has canonical one, i.e. 0 or 1.
30425 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30426 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30427 // Quit if true value is not a constant.
30430 // Quit if false value is not a constant.
30432 SDValue Op = SetCC.getOperand(0);
30433 // Skip 'zext' or 'trunc' node.
30434 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30435 Op.getOpcode() == ISD::TRUNCATE)
30436 Op = Op.getOperand(0);
30437 // A special case for rdrand/rdseed, where 0 is set if false cond is
// found. Only CMOVs fed by RDRAND/RDSEED result 0 are accepted here.
30439 if ((Op.getOpcode() != X86ISD::RDRAND &&
30440 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30443 // Quit if false value is not the constant 0 or 1.
30444 bool FValIsFalse = true;
30445 if (FVal && FVal->getZExtValue() != 0) {
30446 if (FVal->getZExtValue() != 1)
30448 // If FVal is 1, opposite cond is needed.
30449 needOppositeCond = !needOppositeCond;
30450 FValIsFalse = false;
30452 // Quit if TVal is not the constant opposite of FVal.
30453 if (FValIsFalse && TVal->getZExtValue() != 1)
30455 if (!FValIsFalse && TVal->getZExtValue() != 0)
// The CMOV selects a canonical bool; forward its condition and EFLAGS input.
30457 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30458 if (needOppositeCond)
30459 CC = X86::GetOppositeBranchCondition(CC);
30460 return SetCC.getOperand(3);
30467 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Matches either of the following forms:
30469 /// (X86or (X86setcc) (X86setcc))
30470 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
/// On success, \p CC0 and \p CC1 receive the two condition codes and
/// \p Flags the shared EFLAGS value; returns false otherwise.
30471 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30472 X86::CondCode &CC1, SDValue &Flags,
// Look through a compare-against-zero wrapper first.
30474 if (Cond->getOpcode() == X86ISD::CMP) {
30475 if (!isNullConstant(Cond->getOperand(1)))
30478 Cond = Cond->getOperand(0);
30483 SDValue SetCC0, SetCC1;
30484 switch (Cond->getOpcode()) {
30485 default: return false;
30492 SetCC0 = Cond->getOperand(0);
30493 SetCC1 = Cond->getOperand(1);
30497 // Make sure we have SETCC nodes, using the same flags value.
30498 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30499 SetCC1.getOpcode() != X86ISD::SETCC ||
30500 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30503 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30504 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30505 Flags = SetCC0->getOperand(1);
30509 /// Optimize an EFLAGS definition used according to the condition code \p CC
30510 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30511 /// uses of chain values.
30512 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30513 SelectionDAG &DAG) {
// Try the SETCC-pattern simplification first; if it does not apply, fall
// back to folding a compare of an atomic RMW result into a LOCKed op.
30514 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30516 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30519 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30520 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30521 TargetLowering::DAGCombinerInfo &DCI,
30522 const X86Subtarget &Subtarget) {
30525 // If the flag operand isn't dead, don't touch this CMOV.
30526 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
// X86 CMOV operand order: false value, true value, condcode, EFLAGS.
30529 SDValue FalseOp = N->getOperand(0);
30530 SDValue TrueOp = N->getOperand(1);
30531 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30532 SDValue Cond = N->getOperand(3);
30534 if (CC == X86::COND_E || CC == X86::COND_NE) {
30535 switch (Cond.getOpcode()) {
30539 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30540 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30541 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30545 // Try to simplify the EFLAGS and condition code operands.
30546 // We can't always do this as FCMOV only supports a subset of X86 cond.
30547 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30548 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30549 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30551 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30555 // If this is a select between two integer constants, try to do some
30556 // optimizations. Note that the operands are ordered the opposite of SELECT
// operands (false value first, then true value).
30558 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30559 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30560 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30561 // larger than FalseC (the false value).
30562 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30563 CC = X86::GetOppositeBranchCondition(CC);
30564 std::swap(TrueC, FalseC);
30565 std::swap(TrueOp, FalseOp);
30568 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30569 // This is efficient for any integer data type (including i8/i16) and
// avoids the CMOV's register-pressure/latency cost.
30571 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30572 Cond = getSETCC(CC, Cond, DL, DAG);
30574 // Zero extend the condition if needed.
30575 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30577 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30578 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30579 DAG.getConstant(ShAmt, DL, MVT::i8));
30580 if (N->getNumValues() == 2) // Dead flag value?
30581 return DCI.CombineTo(N, Cond, SDValue());
30585 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
30586 // for any integer data type, including i8/i16.
30587 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30588 Cond = getSETCC(CC, Cond, DL, DAG);
30590 // Zero extend the condition if needed.
30591 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30592 FalseC->getValueType(0), Cond);
30593 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30594 SDValue(FalseC, 0));
30596 if (N->getNumValues() == 2) // Dead flag value?
30597 return DCI.CombineTo(N, Cond, SDValue());
30601 // Optimize cases that will turn into an LEA instruction. This requires
30602 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30603 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30604 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
// Truncate the difference to 32 bits for the i32 case so the switch below
// sees the value modulo 2^32.
30605 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30607 bool isFastMultiplier = false;
30609 switch ((unsigned char)Diff) {
30611 case 1: // result = add base, cond
30612 case 2: // result = lea base( , cond*2)
30613 case 3: // result = lea base(cond, cond*2)
30614 case 4: // result = lea base( , cond*4)
30615 case 5: // result = lea base(cond, cond*4)
30616 case 8: // result = lea base( , cond*8)
30617 case 9: // result = lea base(cond, cond*8)
30618 isFastMultiplier = true;
30623 if (isFastMultiplier) {
30624 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
30625 Cond = getSETCC(CC, Cond, DL ,DAG);
30626 // Zero extend the condition if needed.
30627 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
30629 // Scale the condition by the difference.
30631 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30632 DAG.getConstant(Diff, DL, Cond.getValueType()));
30634 // Add the base if non-zero.
30635 if (FalseC->getAPIntValue() != 0)
30636 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30637 SDValue(FalseC, 0));
30638 if (N->getNumValues() == 2) // Dead flag value?
30639 return DCI.CombineTo(N, Cond, SDValue());
30646 // Handle these cases:
30647 // (select (x != c), e, c) -> select (x != c), e, x),
30648 // (select (x == c), c, e) -> select (x == c), x, e)
30649 // where the c is an integer constant, and the "select" is the combination
30650 // of CMOV and CMP.
30652 // The rationale for this change is that the conditional-move from a constant
30653 // needs two instructions, however, conditional-move from a register needs
30654 // only one instruction.
30656 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30657 // some instruction-combining opportunities. This opt needs to be
30658 // postponed as late as possible.
30660 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
30661 // the DCI.xxxx conditions are provided to postpone the optimization as
30662 // late as possible.
30664 ConstantSDNode *CmpAgainst = nullptr;
30665 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30666 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30667 !isa<ConstantSDNode>(Cond.getOperand(0))) {
// Normalize to the COND_E form so only one replacement pattern is needed.
30669 if (CC == X86::COND_NE &&
30670 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30671 CC = X86::GetOppositeBranchCondition(CC);
30672 std::swap(TrueOp, FalseOp);
// When (x == c) selects c, select x instead — one-instruction register CMOV.
30675 if (CC == X86::COND_E &&
30676 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30677 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30678 DAG.getConstant(CC, DL, MVT::i8), Cond };
30679 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
30684 // Fold and/or of setcc's to double CMOV:
30685 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30686 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30688 // This combine lets us generate:
30689 // cmovcc1 (jcc1 if we don't have CMOV)
30695 // cmovne (jne if we don't have CMOV)
30696 // When we can't use the CMOV instruction, it might increase branch
// mispredicts.
30698 // When we can use CMOV, or when there is no mispredict, this improves
30699 // throughput and reduces register pressure.
30701 if (CC == X86::COND_NE) {
30703 X86::CondCode CC0, CC1;
30705 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// For the AND form, invert both conditions and swap the arms (De Morgan).
30707 std::swap(FalseOp, TrueOp);
30708 CC0 = X86::GetOppositeBranchCondition(CC0);
30709 CC1 = X86::GetOppositeBranchCondition(CC1);
// Chain two CMOVs off the shared EFLAGS and forward the flag result of the
// outer one to any users of this node's flag value.
30712 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
30714 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30715 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30716 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30717 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
30725 /// Different mul shrinking modes.
/// MULS8  — both operands fit in a signed 8-bit range (-128..127).
/// MULU8  — both operands fit in an unsigned 8-bit range (0..255).
/// MULS16 — both operands fit in a signed 16-bit range (-32768..32767).
/// MULU16 — both operands fit in an unsigned 16-bit range (0..65535).
/// (Ranges per the reduceVMULWidth documentation below.)
30726 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
/// Decide whether a vXi32 multiply can be performed in 16 bits by inspecting
/// the known sign bits of both operands. On success, \p Mode receives the
/// chosen ShrinkMode; returns false when no narrowing applies.
30728 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30729 EVT VT = N->getOperand(0).getValueType();
// Only 32-bit element multiplies are candidates for narrowing.
30730 if (VT.getScalarSizeInBits() != 32)
30733 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
// SignBits[i]/IsPositive[i] summarize operand i's value range.
30734 unsigned SignBits[2] = {1, 1};
30735 bool IsPositive[2] = {false, false};
30736 for (unsigned i = 0; i < 2; i++) {
30737 SDValue Opd = N->getOperand(i);
30739 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
30740 // compute signbits for it separately.
30741 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
30742 // For anyextend, it is safe to assume an appropriate number of leading
// bits based on the width of the pre-extension element type.
30744 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
30746 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
30751 IsPositive[i] = true;
30752 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30753 // All the operands of BUILD_VECTOR need to be int constant.
30754 // Find the smallest value range which all the operands belong to.
30756 IsPositive[i] = true;
30757 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
// Undef lanes impose no constraint on the range.
30758 if (SubOp.isUndef())
30760 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
30763 APInt IntVal = CN->getAPIntValue();
30764 if (IntVal.isNegative())
30765 IsPositive[i] = false;
30766 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
// General case: let the DAG compute the operand's known sign bits.
30769 SignBits[i] = DAG.ComputeNumSignBits(Opd);
30770 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
30771 IsPositive[i] = true;
// Pick the narrowest mode both operands support; the thresholds correspond
// to 8/16 significant bits out of the 32-bit elements.
30775 bool AllPositive = IsPositive[0] && IsPositive[1];
30776 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
30777 // When ranges are from -128 ~ 127, use MULS8 mode.
30778 if (MinSignBits >= 25)
30780 // When ranges are from 0 ~ 255, use MULU8 mode.
30781 else if (AllPositive && MinSignBits >= 24)
30783 // When ranges are from -32768 ~ 32767, use MULS16 mode.
30784 else if (MinSignBits >= 17)
30786 // When ranges are from 0 ~ 65535, use MULU16 mode.
30787 else if (AllPositive && MinSignBits >= 16)
30794 /// When the operands of vector mul are extended from smaller size values,
30795 /// like i8 and i16, the type of mul may be shrunk to generate more
30796 /// efficient code. Two typical patterns are handled:
30798 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30799 /// %4 = sext/zext <N x i8> %3 to <N x i32>
30800 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30801 /// %5 = mul <N x i32> %2, %4
30804 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30805 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30806 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30807 /// %5 = mul <N x i32> %2, %4
30809 /// There are four mul shrinking modes:
30810 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
30811 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
30812 /// generate pmullw+sext32 for it (MULS8 mode).
30813 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30814 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30815 /// generate pmullw+zext32 for it (MULU8 mode).
30816 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30817 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30818 /// generate pmullw+pmulhw for it (MULS16 mode).
30819 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30820 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30821 /// generate pmullw+pmulhuw for it (MULU16 mode).
30822 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30823 const X86Subtarget &Subtarget) {
30824 // Check for legality
30825 // pmullw/pmulhw are not supported by SSE.
30826 if (!Subtarget.hasSSE2())
30829 // Check for profitability
30830 // pmulld is supported since SSE41. It is better to use pmulld
30831 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// pmullw+pmulhw, or when optimizing for minimum size.
30833 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30834 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30838 if (!canReduceVMulWidth(N, DAG, Mode))
30842 SDValue N0 = N->getOperand(0);
30843 SDValue N1 = N->getOperand(1);
30844 EVT VT = N->getOperand(0).getValueType();
// A 128-bit XMM register holds 8 i16 lanes.
30845 unsigned RegSize = 128;
30846 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
30848 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30849 // Shrink the operands of mul.
30850 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30851 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Wide-enough case: the narrowed operands fill at least one full register.
30853 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30854 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30855 // lower part is needed.
30856 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30857 if (Mode == MULU8 || Mode == MULS8) {
30858 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
30861 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30862 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30863 // the higher part is also needed.
30864 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30865 ReducedVT, NewN0, NewN1);
30867 // Repack the lower part and higher part result of mul into a wider
// result: interleave lo/hi i16 halves to rebuild the i32 products.
30869 // Generate shuffle functioning as punpcklwd.
30870 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30871 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30872 ShuffleMask[2 * i] = i;
30873 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
30876 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30877 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30878 // Generate shuffle functioning as punpckhwd.
30879 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30880 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30881 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
30884 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30885 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30886 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30889 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30890 // to legalize the mul explicitly because implicit legalization for type
30891 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30892 // instructions which will not exist when we explicitly legalize it by
30893 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30894 // <4 x i16> undef).
30896 // Legalize the operands of mul.
30897 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30898 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30899 if ((RegSize % ReducedSizeInBits) != 0)
// Pad each operand out to a full register with undef lanes.
30902 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30903 DAG.getUNDEF(ReducedVT));
30905 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30907 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30909 if (Mode == MULU8 || Mode == MULS8) {
30910 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
// part is needed; extend it back to i32 lanes and extract the original
30912 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30914 // convert the type of mul result to VT.
30915 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30916 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
30917 : ISD::SIGN_EXTEND_VECTOR_INREG,
30919 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30920 DAG.getIntPtrConstant(0, DL));
30922 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
30923 // MULU16/MULS16, both parts are needed.
30924 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30925 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30926 OpsVT, NewN0, NewN1);
30928 // Repack the lower part and higher part result of mul into a wider
30929 // result. Make sure the type of mul result is VT.
30930 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30931 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30932 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30933 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30934 DAG.getIntPtrConstant(0, DL));
30939 /// Optimize a single multiply with constant into two operations in order to
30940 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
30941 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
30942 TargetLowering::DAGCombinerInfo &DCI,
30943 const X86Subtarget &Subtarget) {
30944 EVT VT = N->getValueType(0);
// Vector multiplies are handled by the width-reduction combine instead.
30945 if (DCI.isBeforeLegalize() && VT.isVector())
30946 return reduceVMULWidth(N, DAG, Subtarget);
30948 // An imul is usually smaller than the alternative sequence.
30949 if (DAG.getMachineFunction().getFunction()->optForMinSize())
30952 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
// Only scalar i32/i64 benefit from the LEA/SHL decompositions below.
30955 if (VT != MVT::i64 && VT != MVT::i32)
30958 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
30961 uint64_t MulAmt = C->getZExtValue();
// These amounts already map directly to SHL or a single LEA during ISel.
30962 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
// Try to split MulAmt into MulAmt1 * MulAmt2 with MulAmt1 in {3, 5, 9}.
30965 uint64_t MulAmt1 = 0;
30966 uint64_t MulAmt2 = 0;
30967 if ((MulAmt % 9) == 0) {
30969 MulAmt2 = MulAmt / 9;
30970 } else if ((MulAmt % 5) == 0) {
30972 MulAmt2 = MulAmt / 5;
30973 } else if ((MulAmt % 3) == 0) {
30975 MulAmt2 = MulAmt / 3;
30981 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
30983 if (isPowerOf2_64(MulAmt2) &&
30984 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
30985 // If second multiplier is pow2, issue it first. We want the multiply by
30986 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add, in which case it can absorb the scale itself.
30988 std::swap(MulAmt1, MulAmt2);
// Emit the first factor as a shift (power of two) or LEA-style MUL_IMM.
30990 if (isPowerOf2_64(MulAmt1))
30991 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30992 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30994 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30995 DAG.getConstant(MulAmt1, DL, VT));
// Chain the second factor the same way.
30997 if (isPowerOf2_64(MulAmt2))
30998 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30999 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31001 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31002 DAG.getConstant(MulAmt2, DL, VT));
31006 assert(MulAmt != 0 &&
31007 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31008 "Both cases that could cause potential overflows should have "
31009 "already been handled.");
// Try (2^N +/- 1) decompositions; guard against amounts whose negation or
// +/-1 adjustment would overflow int64.
31010 int64_t SignMulAmt = C->getSExtValue();
31011 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31012 (SignMulAmt != -INT64_MAX)) {
31013 int NumSign = SignMulAmt > 0 ? 1 : -1;
31014 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31015 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31016 if (IsPowerOf2_64PlusOne) {
31017 // (mul x, 2^N + 1) => (add (shl x, N), x)
31018 NewMul = DAG.getNode(
31019 ISD::ADD, DL, VT, N->getOperand(0),
31020 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31021 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31023 } else if (IsPowerOf2_64MinusOne) {
31024 // (mul x, 2^N - 1) => (sub (shl x, N), x)
31025 NewMul = DAG.getNode(
31027 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31028 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31032 // To negate, subtract the number from zero
31033 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31035 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31040 // Do not add new nodes to DAG combiner worklist.
31041 DCI.CombineTo(N, NewMul, false);
31046 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31047 SDValue N0 = N->getOperand(0);
31048 SDValue N1 = N->getOperand(1);
31049 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31050 EVT VT = N0.getValueType();
31052 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31053 // since the result of setcc_c is all zero's or all ones.
31054 if (VT.isInteger() && !VT.isVector() &&
31055 N1C && N0.getOpcode() == ISD::AND &&
31056 N0.getOperand(1).getOpcode() == ISD::Constant) {
31057 SDValue N00 = N0.getOperand(0);
31058 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31059 const APInt &ShAmt = N1C->getAPIntValue();
31060 Mask = Mask.shl(ShAmt);
31061 bool MaskOK = false;
31062 // We can handle cases concerning bit-widening nodes containing setcc_c if
31063 // we carefully interrogate the mask to make sure we are semantics
31065 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31066 // of the underlying setcc_c operation if the setcc_c was zero extended.
31067 // Consider the following example:
31068 // zext(setcc_c) -> i32 0x0000FFFF
31069 // c1 -> i32 0x0000FFFF
31070 // c2 -> i32 0x00000001
31071 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31072 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
31073 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31075 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31076 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31078 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31079 N00.getOpcode() == ISD::ANY_EXTEND) &&
31080 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31081 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31083 if (MaskOK && Mask != 0) {
31085 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31089 // Hardware support for vector shifts is sparse which makes us scalarize the
31090 // vector operations in many cases. Also, on sandybridge ADD is faster than
31092 // (shl V, 1) -> add V,V
31093 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31094 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31095 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31096 // We shift all of the values by one. In many cases we do not have
31097 // hardware support for this operation. This is better expressed as an ADD
31099 if (N1SplatC->getAPIntValue() == 1)
31100 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31106 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31107 SDValue N0 = N->getOperand(0);
31108 SDValue N1 = N->getOperand(1);
31109 EVT VT = N0.getValueType();
31110 unsigned Size = VT.getSizeInBits();
31112 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31113 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31114 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31115 // depending on sign of (SarConst - [56,48,32,24,16])
31117 // sexts in X86 are MOVs. The MOVs have the same code size
31118 // as above SHIFTs (only SHIFT on 1 has lower code size).
31119 // However the MOVs have 2 advantages to a SHIFT:
31120 // 1. MOVs can write to a register that differs from source
31121 // 2. MOVs accept memory operands
31123 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31124 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31125 N0.getOperand(1).getOpcode() != ISD::Constant)
31128 SDValue N00 = N0.getOperand(0);
31129 SDValue N01 = N0.getOperand(1);
31130 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31131 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31132 EVT CVT = N1.getValueType();
31134 if (SarConst.isNegative())
31137 for (MVT SVT : MVT::integer_valuetypes()) {
31138 unsigned ShiftSize = SVT.getSizeInBits();
31139 // skipping types without corresponding sext/zext and
31140 // ShlConst that is not one of [56,48,32,24,16]
31141 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31145 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31146 SarConst = SarConst - (Size - ShiftSize);
31149 else if (SarConst.isNegative())
31150 return DAG.getNode(ISD::SHL, DL, VT, NN,
31151 DAG.getConstant(-SarConst, DL, CVT));
31153 return DAG.getNode(ISD::SRA, DL, VT, NN,
31154 DAG.getConstant(SarConst, DL, CVT));
31159 /// \brief Returns a vector of 0s if the node in input is a vector logical
31160 /// shift by a constant amount which is known to be bigger than or equal
31161 /// to the vector element size in bits.
31162 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31163 const X86Subtarget &Subtarget) {
31164 EVT VT = N->getValueType(0);
31166 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31167 (!Subtarget.hasInt256() ||
31168 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31171 SDValue Amt = N->getOperand(1);
31173 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31174 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31175 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31176 unsigned MaxAmount =
31177 VT.getSimpleVT().getScalarSizeInBits();
31179 // SSE2/AVX2 logical shifts always return a vector of 0s
31180 // if the shift amount is bigger than or equal to
31181 // the element size. The constant shift amount will be
31182 // encoded as a 8-bit immediate.
31183 if (ShiftAmt.trunc(8).uge(MaxAmount))
31184 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31190 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31191 TargetLowering::DAGCombinerInfo &DCI,
31192 const X86Subtarget &Subtarget) {
31193 if (N->getOpcode() == ISD::SHL)
31194 if (SDValue V = combineShiftLeft(N, DAG))
31197 if (N->getOpcode() == ISD::SRA)
31198 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31201 // Try to fold this logical shift into a zero vector.
31202 if (N->getOpcode() != ISD::SRA)
31203 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31209 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31210 TargetLowering::DAGCombinerInfo &DCI,
31211 const X86Subtarget &Subtarget) {
31212 unsigned Opcode = N->getOpcode();
31213 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31214 X86ISD::VSRLI == Opcode) &&
31215 "Unexpected shift opcode");
31216 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31217 EVT VT = N->getValueType(0);
31218 SDValue N0 = N->getOperand(0);
31219 SDValue N1 = N->getOperand(1);
31220 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31221 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31222 "Unexpected value type");
31224 // Out of range logical bit shifts are guaranteed to be zero.
31225 // Out of range arithmetic bit shifts splat the sign bit.
31226 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31227 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31229 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31231 ShiftVal = NumBitsPerElt - 1;
31234 // Shift N0 by zero -> N0.
31238 // Shift zero -> zero.
31239 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31240 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31242 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31243 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31244 // TODO - support other sra opcodes as needed.
31245 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31246 N0.getOpcode() == X86ISD::VSRAI)
31247 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31249 // We can decode 'whole byte' logical bit shifts as shuffles.
31250 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31252 SmallVector<int, 1> NonceMask; // Just a placeholder.
31253 NonceMask.push_back(0);
31254 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31255 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31257 return SDValue(); // This routine will use CombineTo to replace N.
31260 // Constant Folding.
31262 SmallVector<APInt, 32> EltBits;
31263 if (N->isOnlyUserOf(N0.getNode()) &&
31264 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31265 assert(EltBits.size() == VT.getVectorNumElements() &&
31266 "Unexpected shift value type");
31267 unsigned ShiftImm = ShiftVal.getZExtValue();
31268 for (APInt &Elt : EltBits) {
31269 if (X86ISD::VSHLI == Opcode)
31270 Elt = Elt.shl(ShiftImm);
31271 else if (X86ISD::VSRAI == Opcode)
31272 Elt = Elt.ashr(ShiftImm);
31274 Elt.lshrInPlace(ShiftImm);
31276 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31282 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31283 TargetLowering::DAGCombinerInfo &DCI,
31284 const X86Subtarget &Subtarget) {
31286 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31287 (N->getOpcode() == X86ISD::PINSRW &&
31288 N->getValueType(0) == MVT::v8i16)) &&
31289 "Unexpected vector insertion");
31291 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31293 SmallVector<int, 1> NonceMask; // Just a placeholder.
31294 NonceMask.push_back(0);
31295 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31296 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31301 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31302 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31303 /// OR -> CMPNEQSS.
31304 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31305 TargetLowering::DAGCombinerInfo &DCI,
31306 const X86Subtarget &Subtarget) {
31309 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31310 // we're requiring SSE2 for both.
31311 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31312 SDValue N0 = N->getOperand(0);
31313 SDValue N1 = N->getOperand(1);
31314 SDValue CMP0 = N0->getOperand(1);
31315 SDValue CMP1 = N1->getOperand(1);
31318 // The SETCCs should both refer to the same CMP.
31319 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31322 SDValue CMP00 = CMP0->getOperand(0);
31323 SDValue CMP01 = CMP0->getOperand(1);
31324 EVT VT = CMP00.getValueType();
31326 if (VT == MVT::f32 || VT == MVT::f64) {
31327 bool ExpectingFlags = false;
31328 // Check for any users that want flags:
31329 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31330 !ExpectingFlags && UI != UE; ++UI)
31331 switch (UI->getOpcode()) {
31336 ExpectingFlags = true;
31338 case ISD::CopyToReg:
31339 case ISD::SIGN_EXTEND:
31340 case ISD::ZERO_EXTEND:
31341 case ISD::ANY_EXTEND:
31345 if (!ExpectingFlags) {
31346 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31347 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31349 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31350 X86::CondCode tmp = cc0;
31355 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31356 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31357 // FIXME: need symbolic constants for these magic numbers.
31358 // See X86ATTInstPrinter.cpp:printSSECC().
31359 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31360 if (Subtarget.hasAVX512()) {
31361 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
31363 DAG.getConstant(x86cc, DL, MVT::i8));
31364 if (N->getValueType(0) != MVT::i1)
31365 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
31369 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31370 CMP00.getValueType(), CMP00, CMP01,
31371 DAG.getConstant(x86cc, DL,
31374 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31375 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31377 if (is64BitFP && !Subtarget.is64Bit()) {
31378 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31379 // 64-bit integer, since that's not a legal type. Since
31380 // OnesOrZeroesF is all ones of all zeroes, we don't need all the
31381 // bits, but can do this little dance to extract the lowest 32 bits
31382 // and work with those going forward.
31383 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31385 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31386 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31387 Vector32, DAG.getIntPtrConstant(0, DL));
31391 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31392 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31393 DAG.getConstant(1, DL, IntVT));
31394 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31396 return OneBitOfTruth;
31404 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31405 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31406 assert(N->getOpcode() == ISD::AND);
31408 EVT VT = N->getValueType(0);
31409 SDValue N0 = N->getOperand(0);
31410 SDValue N1 = N->getOperand(1);
31413 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31416 if (N0.getOpcode() == ISD::XOR &&
31417 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31418 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31420 if (N1.getOpcode() == ISD::XOR &&
31421 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31422 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31427 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31428 // register. In most cases we actually compare or select YMM-sized registers
31429 // and mixing the two types creates horrible code. This method optimizes
31430 // some of the transition sequences.
31431 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31432 TargetLowering::DAGCombinerInfo &DCI,
31433 const X86Subtarget &Subtarget) {
31434 EVT VT = N->getValueType(0);
31435 if (!VT.is256BitVector())
31438 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31439 N->getOpcode() == ISD::ZERO_EXTEND ||
31440 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31442 SDValue Narrow = N->getOperand(0);
31443 EVT NarrowVT = Narrow->getValueType(0);
31444 if (!NarrowVT.is128BitVector())
31447 if (Narrow->getOpcode() != ISD::XOR &&
31448 Narrow->getOpcode() != ISD::AND &&
31449 Narrow->getOpcode() != ISD::OR)
31452 SDValue N0 = Narrow->getOperand(0);
31453 SDValue N1 = Narrow->getOperand(1);
31456 // The Left side has to be a trunc.
31457 if (N0.getOpcode() != ISD::TRUNCATE)
31460 // The type of the truncated inputs.
31461 EVT WideVT = N0->getOperand(0)->getValueType(0);
31465 // The right side has to be a 'trunc' or a constant vector.
31466 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31467 ConstantSDNode *RHSConstSplat = nullptr;
31468 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31469 RHSConstSplat = RHSBV->getConstantSplatNode();
31470 if (!RHSTrunc && !RHSConstSplat)
31473 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31475 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31478 // Set N0 and N1 to hold the inputs to the new wide operation.
31479 N0 = N0->getOperand(0);
31480 if (RHSConstSplat) {
31481 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31482 SDValue(RHSConstSplat, 0));
31483 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31484 } else if (RHSTrunc) {
31485 N1 = N1->getOperand(0);
31488 // Generate the wide operation.
31489 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31490 unsigned Opcode = N->getOpcode();
31492 case ISD::ANY_EXTEND:
31494 case ISD::ZERO_EXTEND: {
31495 unsigned InBits = NarrowVT.getScalarSizeInBits();
31496 APInt Mask = APInt::getAllOnesValue(InBits);
31497 Mask = Mask.zext(VT.getScalarSizeInBits());
31498 return DAG.getNode(ISD::AND, DL, VT,
31499 Op, DAG.getConstant(Mask, DL, VT));
31501 case ISD::SIGN_EXTEND:
31502 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31503 Op, DAG.getValueType(NarrowVT));
31505 llvm_unreachable("Unexpected opcode");
31509 /// If both input operands of a logic op are being cast from floating point
31510 /// types, try to convert this into a floating point logic node to avoid
31511 /// unnecessary moves from SSE to integer registers.
31512 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31513 const X86Subtarget &Subtarget) {
31514 unsigned FPOpcode = ISD::DELETED_NODE;
31515 if (N->getOpcode() == ISD::AND)
31516 FPOpcode = X86ISD::FAND;
31517 else if (N->getOpcode() == ISD::OR)
31518 FPOpcode = X86ISD::FOR;
31519 else if (N->getOpcode() == ISD::XOR)
31520 FPOpcode = X86ISD::FXOR;
31522 assert(FPOpcode != ISD::DELETED_NODE &&
31523 "Unexpected input node for FP logic conversion");
31525 EVT VT = N->getValueType(0);
31526 SDValue N0 = N->getOperand(0);
31527 SDValue N1 = N->getOperand(1);
31529 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31530 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31531 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31532 SDValue N00 = N0.getOperand(0);
31533 SDValue N10 = N1.getOperand(0);
31534 EVT N00Type = N00.getValueType();
31535 EVT N10Type = N10.getValueType();
31536 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31537 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31538 return DAG.getBitcast(VT, FPLogic);
31544 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31545 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31546 /// with a shift-right to eliminate loading the vector constant mask value.
31547 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31548 const X86Subtarget &Subtarget) {
31549 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31550 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31551 EVT VT0 = Op0.getValueType();
31552 EVT VT1 = Op1.getValueType();
31554 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
31558 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
31559 !SplatVal.isMask())
31562 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
31565 unsigned EltBitWidth = VT0.getScalarSizeInBits();
31566 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
31570 unsigned ShiftVal = SplatVal.countTrailingOnes();
31571 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31572 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31573 return DAG.getBitcast(N->getValueType(0), Shift);
31576 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31577 TargetLowering::DAGCombinerInfo &DCI,
31578 const X86Subtarget &Subtarget) {
31579 if (DCI.isBeforeLegalizeOps())
31582 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31585 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31588 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
31591 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
31594 EVT VT = N->getValueType(0);
31595 SDValue N0 = N->getOperand(0);
31596 SDValue N1 = N->getOperand(1);
31599 // Attempt to recursively combine a bitmask AND with shuffles.
31600 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
31602 SmallVector<int, 1> NonceMask; // Just a placeholder.
31603 NonceMask.push_back(0);
31604 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31605 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31607 return SDValue(); // This routine will use CombineTo to replace N.
31610 // Create BEXTR instructions
31611 // BEXTR is ((X >> imm) & (2**size-1))
31612 if (VT != MVT::i32 && VT != MVT::i64)
31615 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
31617 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
31620 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31621 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31622 if (MaskNode && ShiftNode) {
31623 uint64_t Mask = MaskNode->getZExtValue();
31624 uint64_t Shift = ShiftNode->getZExtValue();
31625 if (isMask_64(Mask)) {
31626 uint64_t MaskSize = countPopulation(Mask);
31627 if (Shift + MaskSize <= VT.getSizeInBits())
31628 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
31629 DAG.getConstant(Shift | (MaskSize << 8), DL,
31637 // (or (and (m, y), (pandn m, x)))
31639 // (vselect m, x, y)
31640 // As a special case, try to fold:
31641 // (or (and (m, (sub 0, x)), (pandn m, x)))
31643 // (sub (xor X, M), M)
31644 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31645 const X86Subtarget &Subtarget) {
31646 assert(N->getOpcode() == ISD::OR);
31648 SDValue N0 = N->getOperand(0);
31649 SDValue N1 = N->getOperand(1);
31650 EVT VT = N->getValueType(0);
31652 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
31654 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
31656 // Canonicalize pandn to RHS
31657 if (N0.getOpcode() == X86ISD::ANDNP)
31660 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
31663 SDValue Mask = N1.getOperand(0);
31664 SDValue X = N1.getOperand(1);
31666 if (N0.getOperand(0) == Mask)
31667 Y = N0.getOperand(1);
31668 if (N0.getOperand(1) == Mask)
31669 Y = N0.getOperand(0);
31671 // Check to see if the mask appeared in both the AND and ANDNP.
31675 // Validate that X, Y, and Mask are bitcasts, and see through them.
31676 Mask = peekThroughBitcasts(Mask);
31677 X = peekThroughBitcasts(X);
31678 Y = peekThroughBitcasts(Y);
31680 EVT MaskVT = Mask.getValueType();
31682 // Validate that the Mask operand is a vector sra node.
31683 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
31684 // there is no psrai.b
31685 unsigned EltBits = MaskVT.getScalarSizeInBits();
31686 unsigned SraAmt = ~0;
31687 if (Mask.getOpcode() == ISD::SRA) {
31688 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
31689 if (auto *AmtConst = AmtBV->getConstantSplatNode())
31690 SraAmt = AmtConst->getZExtValue();
31691 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
31692 SDValue SraC = Mask.getOperand(1);
31693 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
31695 if ((SraAmt + 1) != EltBits)
31701 // (or (and (M, (sub 0, X)), (pandn M, X)))
31702 // which is a special case of vselect:
31703 // (vselect M, (sub 0, X), X)
31705 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31706 // We know that, if fNegate is 0 or 1:
31707 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31709 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31710 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31711 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31712 // This lets us transform our vselect to:
31713 // (add (xor X, M), (and M, 1))
31715 // (sub (xor X, M), M)
31716 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
31717 auto IsNegV = [](SDNode *N, SDValue V) {
31718 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
31719 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
31722 if (IsNegV(Y.getNode(), X))
31724 else if (IsNegV(X.getNode(), Y))
31728 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
31729 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31730 SDValue SubOp2 = Mask;
31732 // If the negate was on the false side of the select, then
31733 // the operands of the SUB need to be swapped. PR 27251.
31734 // This is because the pattern being matched above is
31735 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31736 // but if the pattern matched was
31737 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31738 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31739 // pattern also needs to be a negation of the replacement pattern above.
31740 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31741 // sub accomplishes the negation of the replacement pattern.
31743 std::swap(SubOp1, SubOp2);
31745 return DAG.getBitcast(VT,
31746 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
31750 // PBLENDVB is only available on SSE 4.1.
31751 if (!Subtarget.hasSSE41())
31754 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31756 X = DAG.getBitcast(BlendVT, X);
31757 Y = DAG.getBitcast(BlendVT, Y);
31758 Mask = DAG.getBitcast(BlendVT, Mask);
31759 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
31760 return DAG.getBitcast(VT, Mask);
31763 // Helper function for combineOrCmpEqZeroToCtlzSrl
31767 // srl(ctlz x), log2(bitsize(x))
31768 // Input pattern is checked by caller.
31769 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31770 SelectionDAG &DAG) {
31771 SDValue Cmp = Op.getOperand(1);
31772 EVT VT = Cmp.getOperand(0).getValueType();
31773 unsigned Log2b = Log2_32(VT.getSizeInBits());
31775 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31776 // The result of the shift is true or false, and on X86, the 32-bit
31777 // encoding of shr and lzcnt is more desirable.
31778 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31779 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31780 DAG.getConstant(Log2b, dl, VT));
31781 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31784 // Try to transform:
31785 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
31787 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
31788 // Will also attempt to match more generic cases, eg:
31789 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31790 // Only applies if the target supports the FastLZCNT feature.
31791 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31792 TargetLowering::DAGCombinerInfo &DCI,
31793 const X86Subtarget &Subtarget) {
31794 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
31797 auto isORCandidate = [](SDValue N) {
31798 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31801 // Check the zero extend is extending to 32-bit or more. The code generated by
31802 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31803 // instructions to clear the upper bits.
31804 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
31805 !isORCandidate(N->getOperand(0)))
31808 // Check the node matches: setcc(eq, cmp 0)
31809 auto isSetCCCandidate = [](SDValue N) {
31810 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31811 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31812 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31813 isNullConstant(N->getOperand(1).getOperand(1)) &&
31814 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31817 SDNode *OR = N->getOperand(0).getNode();
31818 SDValue LHS = OR->getOperand(0);
31819 SDValue RHS = OR->getOperand(1);
31821 // Save nodes matching or(or, setcc(eq, cmp 0)).
31822 SmallVector<SDNode *, 2> ORNodes;
31823 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31824 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31825 ORNodes.push_back(OR);
31826 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31827 LHS = OR->getOperand(0);
31828 RHS = OR->getOperand(1);
31831 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31832 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
31833 !isORCandidate(SDValue(OR, 0)))
31836 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31838 // or(srl(ctlz),srl(ctlz)).
31839 // The dag combiner can then fold it into:
31840 // srl(or(ctlz, ctlz)).
31841 EVT VT = OR->getValueType(0);
31842 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31843 SDValue Ret, NewRHS;
31844 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
31845 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
31850 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31851 while (ORNodes.size() > 0) {
31852 OR = ORNodes.pop_back_val();
31853 LHS = OR->getOperand(0);
31854 RHS = OR->getOperand(1);
31855 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31856 if (RHS->getOpcode() == ISD::OR)
31857 std::swap(LHS, RHS);
31858 EVT VT = OR->getValueType(0);
31859 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
31862 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
31866 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
31871 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31872 TargetLowering::DAGCombinerInfo &DCI,
31873 const X86Subtarget &Subtarget) {
31874 if (DCI.isBeforeLegalizeOps())
31877 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31880 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31883 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31886 SDValue N0 = N->getOperand(0);
31887 SDValue N1 = N->getOperand(1);
31888 EVT VT = N->getValueType(0);
31890 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31893 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31894 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31896 // SHLD/SHRD instructions have lower register pressure, but on some
31897 // platforms they have higher latency than the equivalent
31898 // series of shifts/or that would otherwise be generated.
31899 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31900 // have higher latencies and we are not optimizing for size.
31901 if (!OptForSize && Subtarget.isSHLDSlow())
31904 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
31906 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31908 if (!N0.hasOneUse() || !N1.hasOneUse())
31911 SDValue ShAmt0 = N0.getOperand(1);
31912 if (ShAmt0.getValueType() != MVT::i8)
31914 SDValue ShAmt1 = N1.getOperand(1);
31915 if (ShAmt1.getValueType() != MVT::i8)
31917 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31918 ShAmt0 = ShAmt0.getOperand(0);
31919 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31920 ShAmt1 = ShAmt1.getOperand(0);
31923 unsigned Opc = X86ISD::SHLD;
31924 SDValue Op0 = N0.getOperand(0);
31925 SDValue Op1 = N1.getOperand(0);
31926 if (ShAmt0.getOpcode() == ISD::SUB ||
31927 ShAmt0.getOpcode() == ISD::XOR) {
31928 Opc = X86ISD::SHRD;
31929 std::swap(Op0, Op1);
31930 std::swap(ShAmt0, ShAmt1);
31933 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31934 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31935 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31936 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31937 unsigned Bits = VT.getSizeInBits();
31938 if (ShAmt1.getOpcode() == ISD::SUB) {
31939 SDValue Sum = ShAmt1.getOperand(0);
31940 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31941 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31942 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31943 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
31944 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
31945 return DAG.getNode(Opc, DL, VT,
31947 DAG.getNode(ISD::TRUNCATE, DL,
31950 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
31951 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
31952 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
31953 return DAG.getNode(Opc, DL, VT,
31954 N0.getOperand(0), N1.getOperand(0),
31955 DAG.getNode(ISD::TRUNCATE, DL,
31957 } else if (ShAmt1.getOpcode() == ISD::XOR) {
31958 SDValue Mask = ShAmt1.getOperand(1);
31959 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
31960 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
31961 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
31962 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
31963 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
31964 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
31965 if (Op1.getOpcode() == InnerShift &&
31966 isa<ConstantSDNode>(Op1.getOperand(1)) &&
31967 Op1.getConstantOperandVal(1) == 1) {
31968 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31969 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31971 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
31972 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
31973 Op1.getOperand(0) == Op1.getOperand(1)) {
31974 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31975 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31984 /// Generate NEG and CMOV for integer abs.
// Matches the canonical integer-abs expansion abs(X) = XOR(ADD(X, Y), Y)
// where Y = SRA(X, bitwidth-1), and rewrites it as SUB(0, X) plus a CMOV
// conditioned on the SUB's EFLAGS result. Returns SDValue() on no match.
// NOTE(review): this extract drops some source lines (see the gaps in the
// numeric prefixes) -- e.g. the early-bail "return SDValue();" bodies and
// the SDLoc DL definition are not visible here.
31985 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
31986 EVT VT = N->getValueType(0);
31988 // Since X86 does not have CMOV for 8-bit integer, we don't convert
31989 // 8-bit integer abs to NEG and CMOV.
31990 if (VT.isInteger() && VT.getSizeInBits() == 8)
31993 SDValue N0 = N->getOperand(0);
31994 SDValue N1 = N->getOperand(1);
31997 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
31998 // and change it to SUB and CMOV.
31999 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32000 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32001 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32002 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
// The shift amount must be exactly bitwidth-1 so Y is the sign-smear of X.
32003 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32004 // Generate SUB & CMOV.
// X86ISD::SUB yields (result, EFLAGS); the flags value (Neg.getValue(1))
// is passed to the CMOV below together with COND_GE.
32005 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32006 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32007 SDValue Ops[] = {N0.getOperand(0), Neg,
32008 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32009 SDValue(Neg.getNode(), 1)};
32010 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32016 /// Try to turn tests against the signbit in the form of:
32017 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
// ...into a canonical sign-bit test setcc(X > -1). This recognizes "sign
// bit is clear" computed by shifting the sign bit down and xoring with 1,
// and replaces it with a single compare.
32020 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32021 // This is only worth doing if the output type is i8 or i1.
32022 EVT ResultType = N->getValueType(0);
32023 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32026 SDValue N0 = N->getOperand(0);
32027 SDValue N1 = N->getOperand(1);
32029 // We should be performing an xor against a truncated shift.
32030 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32033 // Make sure we are performing an xor against one.
32034 if (!isOneConstant(N1))
32037 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32038 SDValue Shift = N0.getOperand(0);
32039 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32042 // Make sure we are truncating from one of i16, i32 or i64.
32043 EVT ShiftTy = Shift.getValueType();
32044 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32047 // Make sure the shift amount extracts the sign bit.
32048 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32049 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32052 // Create a greater-than comparison against -1.
32053 // N.B. Using SETGE against 0 works but we want a canonical looking
32054 // comparison, using SETGT matches up with what TranslateX86CC.
32056 SDValue ShiftOp = Shift.getOperand(0);
32057 EVT ShiftOpTy = ShiftOp.getValueType();
32058 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Ask the target for its natural setcc result type, then zero-extend to
// the requested i8/i1 result when the two differ.
32059 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32060 *DAG.getContext(), ResultType);
32061 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32062 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32063 if (SetCCResultType != ResultType)
32064 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32068 /// Turn vector tests of the signbit in the form of:
32069 /// xor (sra X, elt_size(X)-1), -1
// ...into an X86ISD::PCMPGT against all-ones, i.e. a single compare
// producing the "sign bit clear" mask directly.
32073 /// This should be called before type legalization because the pattern may not
32074 /// persist after that.
32075 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32076 const X86Subtarget &Subtarget) {
32077 EVT VT = N->getValueType(0);
32078 if (!VT.isSimple())
// PCMPGT availability depends on element size and vector width, hence the
// per-type feature checks. NOTE(review): some case lines (e.g. the 8/16-bit
// element and 512-bit cases) are missing from this extract.
32081 switch (VT.getSimpleVT().SimpleTy) {
32082 default: return SDValue();
32085 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32086 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32090 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32093 // There must be a shift right algebraic before the xor, and the xor must be a
32094 // 'not' operation.
32095 SDValue Shift = N->getOperand(0);
32096 SDValue Ones = N->getOperand(1);
32097 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32098 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32101 // The shift should be smearing the sign bit across each vector element.
32102 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
// The shift amount must be a constant splat of elt_size-1.
32106 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32107 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32108 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32111 // Create a greater-than comparison against -1. We don't use the more obvious
32112 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32113 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32116 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32117 /// is valid for the given \p Subtarget.
// Gatekeeper for the AVX-512 VPMOVUS* family: requires a vector source of
// at most 512 bits, 16..64-bit source elements, 8..32-bit destination
// elements, and either a 512-bit source vector or VLX for narrower ones.
32118 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32119 const X86Subtarget &Subtarget) {
32120 if (!Subtarget.hasAVX512())
32123 // FIXME: Scalar type may be supported if we move it to vector register.
32124 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32127 EVT SrcElVT = SrcVT.getScalarType();
32128 EVT DstElVT = DstVT.getScalarType();
32129 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32131 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32133 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
// Sub-dword source elements additionally require BWI.
32134 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32138 /// Detect a pattern of truncation with saturation:
32139 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32140 /// Return the source value to be truncated or SDValue() if the pattern was not
// matched. (Continuation of the doxygen sentence is missing from this
// extract, as is the declaration of the APInt 'C' used below.)
32142 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32143 if (In.getOpcode() != ISD::UMIN)
32146 //Saturation with truncation. We truncate from InVT to VT.
32147 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32148 "Unexpected types for truncate operation");
// The umin bound must be a constant splat equal to the destination type's
// unsigned max, i.e. a low-bit mask as wide as the destination scalar.
32151 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32152 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32153 // the element size of the destination type.
32154 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32160 /// Detect a pattern of truncation with saturation:
32161 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32162 /// The types should allow to use VPMOVUS* instruction on AVX512.
32163 /// Return the source value to be truncated or SDValue() if the pattern was not
// Thin wrapper: first validates the type combination against the subtarget
// (isSATValidOnAVX512Subtarget), then defers the structural match to
// detectUSatPattern.
32165 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32166 const X86Subtarget &Subtarget) {
32167 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32169 return detectUSatPattern(In, VT);
// Combine truncate(umin(x, unsigned_max)) into a single X86ISD::VTRUNCUS
// (truncate with unsigned saturation) when both the source and destination
// types are legal and the subtarget supports the matching VPMOVUS* form.
// NOTE(review): the "static SDValue" return-type line of this signature is
// missing from this extract (numeric gap before 32173).
32173 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32174 const X86Subtarget &Subtarget) {
32175 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32176 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32178 if (auto USatVal = detectUSatPattern(In, VT))
32179 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32180 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32184 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32185 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
32186 /// X86ISD::AVG instruction.
// Returns the X86ISD::AVG node on success or SDValue() on no match.
// NOTE(review): a trailing parameter of this signature (presumably the
// SDLoc used as 'DL' below) is missing from this extract.
32187 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32188 const X86Subtarget &Subtarget,
32190 if (!VT.isVector() || !VT.isSimple())
32192 EVT InVT = In.getValueType();
32193 unsigned NumElems = VT.getVectorNumElements();
32195 EVT ScalarVT = VT.getVectorElementType();
// Only unsigned byte/word averages exist (PAVGB/PAVGW), and the element
// count must be a power of two.
32196 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32197 isPowerOf2_32(NumElems)))
32200 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32201 // than the original input type (i8/i16).
32202 EVT InScalarVT = InVT.getVectorElementType();
32203 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
// Cap the vector width by the widest AVG the subtarget provides:
// 512-bit needs BWI, 256-bit needs AVX2, otherwise 128-bit SSE2.
32206 if (!Subtarget.hasSSE2())
32208 if (Subtarget.hasBWI()) {
32209 if (VT.getSizeInBits() > 512)
32211 } else if (Subtarget.hasAVX2()) {
32212 if (VT.getSizeInBits() > 256)
32215 if (VT.getSizeInBits() > 128)
32219 // Detect the following pattern:
32221 // %1 = zext <N x i8> %a to <N x i32>
32222 // %2 = zext <N x i8> %b to <N x i32>
32223 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32224 // %4 = add nuw nsw <N x i32> %3, %2
32225 // %5 = lshr <N x i32> %N, <i32 1 x N>
32226 // %6 = trunc <N x i32> %5 to <N x i8>
32228 // In AVX512, the last instruction can also be a trunc store.
32230 if (In.getOpcode() != ISD::SRL)
32233 // A lambda checking the given SDValue is a constant vector and each element
32234 // is in the range [Min, Max].
32235 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32236 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32237 if (!BV || !BV->isConstant())
32239 for (SDValue Op : V->ops()) {
32240 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32243 uint64_t Val = C->getZExtValue();
32244 if (Val < Min || Val > Max)
32250 // Check if each element of the vector is left-shifted by one.
32251 auto LHS = In.getOperand(0);
32252 auto RHS = In.getOperand(1);
// The divide-by-two must be a logical shift right by a splat of 1.
32253 if (!IsConstVectorInRange(RHS, 1, 1))
32255 if (LHS.getOpcode() != ISD::ADD)
32258 // Detect a pattern of a + b + 1 where the order doesn't matter.
32259 SDValue Operands[3];
32260 Operands[0] = LHS.getOperand(0);
32261 Operands[1] = LHS.getOperand(1);
32263 // Take care of the case when one of the operands is a constant vector whose
32264 // element is in the range [1, 256].
32265 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32266 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32267 Operands[0].getOperand(0).getValueType() == VT) {
32268 // The pattern is detected. Subtract one from the constant vector, then
32269 // demote it and emit X86ISD::AVG instruction.
// (x + C) >> 1 == avg(x, C-1) since avg adds the +1 itself.
32270 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32271 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32272 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32273 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
// Otherwise decompose the nested adds: exactly one of the two operands
// must itself be an ADD supplying the third addend.
32277 if (Operands[0].getOpcode() == ISD::ADD)
32278 std::swap(Operands[0], Operands[1]);
32279 else if (Operands[1].getOpcode() != ISD::ADD)
32281 Operands[2] = Operands[1].getOperand(0);
32282 Operands[1] = Operands[1].getOperand(1);
32284 // Now we have three operands of two additions. Check that one of them is a
32285 // constant vector with ones, and the other two are promoted from i8/i16.
32286 for (int i = 0; i < 3; ++i) {
32287 if (!IsConstVectorInRange(Operands[i], 1, 1))
// Move the all-ones constant into slot 2 so slots 0/1 hold the operands.
32289 std::swap(Operands[i], Operands[2]);
32291 // Check if Operands[0] and Operands[1] are results of type promotion.
32292 for (int j = 0; j < 2; ++j)
32293 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32294 Operands[j].getOperand(0).getValueType() != VT)
32297 // The pattern is detected, emit X86ISD::AVG instruction.
32298 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32299 Operands[1].getOperand(0));
// DAG combine for (vector) loads: on subtargets where unaligned 32-byte
// loads are slow, split a 256-bit non-extending load into two 16-byte
// loads, re-assemble with 128-bit inserts, and token-factor the chains.
// NOTE(review): the declarations of 'Fast' and 'dl', and the closing
// "return SDValue();", are among the lines missing from this extract.
32305 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32306 TargetLowering::DAGCombinerInfo &DCI,
32307 const X86Subtarget &Subtarget) {
32308 LoadSDNode *Ld = cast<LoadSDNode>(N);
32309 EVT RegVT = Ld->getValueType(0);
32310 EVT MemVT = Ld->getMemoryVT();
32312 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32314 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32315 // into two 16-byte operations.
32316 ISD::LoadExtType Ext = Ld->getExtensionType();
32318 unsigned AddressSpace = Ld->getAddressSpace();
32319 unsigned Alignment = Ld->getAlignment();
// Only split when the access is allowed but reported as not fast.
32320 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32321 Ext == ISD::NON_EXTLOAD &&
32322 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32323 AddressSpace, Alignment, &Fast) && !Fast) {
32324 unsigned NumElems = RegVT.getVectorNumElements();
32328 SDValue Ptr = Ld->getBasePtr();
32330 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32333 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32334 Alignment, Ld->getMemOperand()->getFlags());
// Second half: 16 bytes further on; its alignment is capped at 16.
32336 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32338 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32339 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32340 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32342 Load2.getValue(1));
32344 SDValue NewVec = DAG.getUNDEF(RegVT);
32345 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32346 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
// Replace both the value and the chain of the original load.
32347 return DCI.CombineTo(N, NewVec, TF, true);
32353 /// If V is a build vector of boolean constants and exactly one of those
32354 /// constants is true, return the operand index of that true element.
32355 /// Otherwise, return -1.
32356 static int getOneTrueElt(SDValue V) {
32357 // This needs to be a build vector of booleans.
32358 // TODO: Checking for the i1 type matches the IR definition for the mask,
32359 // but the mask check could be loosened to i8 or other types. That might
32360 // also require checking more than 'allOnesValue'; eg, the x86 HW
32361 // instructions only require that the MSB is set for each mask element.
32362 // The ISD::MSTORE comments/definition do not specify how the mask operand
// is encoded. (Continuation of the comment above; the remainder of that
// sentence is missing from this extract, as are the final "return -1" /
// "return TrueIndex" lines.)
32364 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32365 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32368 int TrueIndex = -1;
32369 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32370 for (unsigned i = 0; i < NumElts; ++i) {
32371 const SDValue &Op = BV->getOperand(i);
32374 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32377 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32378 // If we already found a one, this is too many.
32379 if (TrueIndex >= 0)
32387 /// Given a masked memory load/store operation, return true if it has one mask
32388 /// bit set. If it has one mask bit set, then also return the memory address of
32389 /// the scalar element to load/store, the vector index to insert/extract that
32390 /// scalar element, and the alignment for the scalar memory access.
// Out-params: Addr (scalar element address), Index (intptr constant for
// insert/extract), Alignment (min of the op's alignment and element size).
32391 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32392 SelectionDAG &DAG, SDValue &Addr,
32393 SDValue &Index, unsigned &Alignment) {
32394 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32395 if (TrueMaskElt < 0)
32398 // Get the address of the one scalar element that is specified by the mask
32399 // using the appropriate offset from the base pointer.
32400 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32401 Addr = MaskedOp->getBasePtr();
32402 if (TrueMaskElt != 0) {
32403 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32404 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32407 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
// The scalar access can only guarantee the smaller of the original
// alignment and the element store size.
32408 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32412 /// If exactly one element of the mask is set for a non-extending masked load,
32413 /// it is a scalar load and vector insert.
32414 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32415 /// mask have already been optimized in IR, so we don't bother with those here.
// NOTE(review): the "static SDValue" return-type line of this signature and
// the SDLoc DL definition are missing from this extract.
32417 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32418 TargetLowering::DAGCombinerInfo &DCI) {
32419 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32420 // However, some target hooks may need to be added to know when the transform
32421 // is profitable. Endianness would also have to be considered.
32423 SDValue Addr, VecIndex;
32424 unsigned Alignment;
32425 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32428 // Load the one scalar element that is specified by the mask using the
32429 // appropriate offset from the base pointer.
32431 EVT VT = ML->getValueType(0);
32432 EVT EltVT = VT.getVectorElementType();
32434 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32435 Alignment, ML->getMemOperand()->getFlags());
32437 // Insert the loaded element into the appropriate place in the vector.
// The pass-through value (getSrc0) supplies all the unselected lanes.
32438 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32440 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
// Optimize a masked load whose mask is a build vector of constants:
// 1) If both the first and last elements are loaded, the whole range is
//    dereferenceable, so replace with a full vector load + vselect.
// 2) Otherwise re-emit the masked load with an undef pass-through and
//    blend in the original pass-through with a select, enabling cheaper
//    blend instructions (e.g. vblendvps -> vblendps).
// NOTE(review): the "static SDValue" return-type line and the SDLoc DL
// definition are missing from this extract.
32444 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32445 TargetLowering::DAGCombinerInfo &DCI) {
32446 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32450 EVT VT = ML->getValueType(0);
32452 // If we are loading the first and last elements of a vector, it is safe and
32453 // always faster to load the whole vector. Replace the masked load with a
32454 // vector load and select.
32455 unsigned NumElts = VT.getVectorNumElements();
32456 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32457 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32458 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32459 if (LoadFirstElt && LoadLastElt) {
32460 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32461 ML->getMemOperand());
32462 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32463 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32466 // Convert a masked load with a constant mask into a masked load and a select.
32467 // This allows the select operation to use a faster kind of select instruction
32468 // (for example, vblendvps -> vblendps).
32470 // Don't try this if the pass-through operand is already undefined. That would
32471 // cause an infinite loop because that's what we're about to create.
32472 if (ML->getSrc0().isUndef())
32475 // The new masked load has an undef pass-through operand. The select uses the
32476 // original pass-through operand.
32477 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32478 ML->getMask(), DAG.getUNDEF(VT),
32479 ML->getMemoryVT(), ML->getMemOperand(),
32480 ML->getExtensionType());
32481 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32483 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
// Top-level DAG combine for ISD::MLOAD. Non-extending loads are first
// handed to the scalar-load and constant-mask simplifications above; the
// remainder of the function widens a sign-extending masked load into a
// full-width masked load of the narrow element type followed by VSEXT.
// NOTE(review): the SDLoc 'dl' definition, the NewMask declaration and
// several early-return bodies are missing from this extract.
32486 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32487 TargetLowering::DAGCombinerInfo &DCI,
32488 const X86Subtarget &Subtarget) {
32489 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32491 // TODO: Expanding load with constant mask may be optimized as well.
32492 if (Mld->isExpandingLoad())
32495 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32496 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32498 // TODO: Do some AVX512 subsets benefit from this transform?
32499 if (!Subtarget.hasAVX512())
32500 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32504 if (Mld->getExtensionType() != ISD::SEXTLOAD)
32507 // Resolve extending loads.
32508 EVT VT = Mld->getValueType(0);
32509 unsigned NumElems = VT.getVectorNumElements();
32510 EVT LdVT = Mld->getMemoryVT();
32513 assert(LdVT != VT && "Cannot extend to the same type");
32514 unsigned ToSz = VT.getScalarSizeInBits();
32515 unsigned FromSz = LdVT.getScalarSizeInBits();
32516 // From/To sizes and ElemCount must be pow of two.
32517 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32518 "Unexpected size for extending masked load");
32520 unsigned SizeRatio = ToSz / FromSz;
32521 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32523 // Create a type on which we perform the shuffle.
// WideVecVT keeps the narrow element type but has SizeRatio times the
// elements, so it occupies the same number of bits as VT.
32524 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32525 LdVT.getScalarType(), NumElems*SizeRatio);
32526 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32528 // Convert Src0 value.
32529 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32530 if (!Mld->getSrc0().isUndef()) {
// Shuffle the pass-through so each original lane lands in the low
// sub-element of its widened slot.
32531 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32532 for (unsigned i = 0; i != NumElems; ++i)
32533 ShuffleVec[i] = i * SizeRatio;
32535 // Can't shuffle using an illegal type.
32536 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32537 "WideVecVT should be legal");
32538 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
32539 DAG.getUNDEF(WideVecVT), ShuffleVec);
32541 // Prepare the new mask.
32543 SDValue Mask = Mld->getMask();
32544 if (Mask.getValueType() == VT) {
32545 // Mask and original value have the same type.
32546 NewMask = DAG.getBitcast(WideVecVT, Mask);
// Lanes beyond NumElems select from the zero vector (index >= NumElems
// picks from the second shuffle operand), forcing them to zero.
32547 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32548 for (unsigned i = 0; i != NumElems; ++i)
32549 ShuffleVec[i] = i * SizeRatio;
32550 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
32551 ShuffleVec[i] = NumElems * SizeRatio;
32552 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32553 DAG.getConstant(0, dl, WideVecVT),
32556 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
// i1 mask: widen by concatenating the original mask with zero vectors.
32557 unsigned WidenNumElts = NumElems*SizeRatio;
32558 unsigned MaskNumElts = VT.getVectorNumElements();
32559 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32562 unsigned NumConcat = WidenNumElts / MaskNumElts;
32563 SmallVector<SDValue, 16> Ops(NumConcat);
32564 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32566 for (unsigned i = 1; i != NumConcat; ++i)
32569 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32572 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
32573 Mld->getBasePtr(), NewMask, WideSrc0,
32574 Mld->getMemoryVT(), Mld->getMemOperand(),
// Finally sign-extend the narrow elements up to the requested type.
32576 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
32577 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
32580 /// If exactly one element of the mask is set for a non-truncating masked store,
32581 /// it is a vector extract and scalar store.
32582 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32583 /// mask have already been optimized in IR, so we don't bother with those here.
// Mirror of reduceMaskedLoadToScalarLoad for the store direction.
32584 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
32585 SelectionDAG &DAG) {
32586 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32587 // However, some target hooks may need to be added to know when the transform
32588 // is profitable. Endianness would also have to be considered.
32590 SDValue Addr, VecIndex;
32591 unsigned Alignment;
32592 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
32595 // Extract the one scalar element that is actually being stored.
32597 EVT VT = MS->getValue().getValueType();
32598 EVT EltVT = VT.getVectorElementType();
32599 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
32600 MS->getValue(), VecIndex);
32602 // Store that element at the appropriate offset from the base pointer.
32603 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
32604 Alignment, MS->getMemOperand()->getFlags());
// Top-level DAG combine for ISD::MSTORE. Non-truncating stores are handed
// to reduceMaskedStoreToScalarStore; truncating stores that are not legal
// as-is are rewritten as a shuffle that packs the narrow elements into the
// low lanes plus a widened-mask masked store of the narrow element type.
// NOTE(review): the SDLoc 'dl' definition, the NewMask declaration and
// some early-return bodies are missing from this extract.
32607 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
32608 const X86Subtarget &Subtarget) {
32609 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
32611 if (Mst->isCompressingStore())
32614 if (!Mst->isTruncatingStore())
32615 return reduceMaskedStoreToScalarStore(Mst, DAG);
32617 // Resolve truncating stores.
32618 EVT VT = Mst->getValue().getValueType();
32619 unsigned NumElems = VT.getVectorNumElements();
32620 EVT StVT = Mst->getMemoryVT();
32623 assert(StVT != VT && "Cannot truncate to the same type");
32624 unsigned FromSz = VT.getScalarSizeInBits();
32625 unsigned ToSz = StVT.getScalarSizeInBits();
32627 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32629 // The truncating store is legal in some cases. For example
32630 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
32631 // are designated for truncate store.
32632 // In this case we don't need any further transformations.
32633 if (TLI.isTruncStoreLegal(VT, StVT))
32636 // From/To sizes and ElemCount must be pow of two.
32637 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32638 "Unexpected size for truncating masked store");
32639 // We are going to use the original vector elt for storing.
32640 // Accumulated smaller vector elements must be a multiple of the store size.
32641 assert (((NumElems * FromSz) % ToSz) == 0 &&
32642 "Unexpected ratio for truncating masked store");
32644 unsigned SizeRatio = FromSz / ToSz;
32645 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32647 // Create a type on which we perform the shuffle.
// Same bit width as VT but with the narrow (store) element type.
32648 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32649 StVT.getScalarType(), NumElems*SizeRatio);
32651 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32653 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
// Pack the low sub-element of each widened slot into consecutive lanes.
32654 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32655 for (unsigned i = 0; i != NumElems; ++i)
32656 ShuffleVec[i] = i * SizeRatio;
32658 // Can't shuffle using an illegal type.
32659 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32660 "WideVecVT should be legal");
32662 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32663 DAG.getUNDEF(WideVecVT),
32667 SDValue Mask = Mst->getMask();
32668 if (Mask.getValueType() == VT) {
32669 // Mask and original value have the same type.
32670 NewMask = DAG.getBitcast(WideVecVT, Mask);
// Lanes past NumElems index into the zero vector (second shuffle
// operand), disabling those store lanes.
32671 for (unsigned i = 0; i != NumElems; ++i)
32672 ShuffleVec[i] = i * SizeRatio;
32673 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
32674 ShuffleVec[i] = NumElems*SizeRatio;
32675 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32676 DAG.getConstant(0, dl, WideVecVT),
32679 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
// i1 mask: widen by concatenating the original mask with zero vectors.
32680 unsigned WidenNumElts = NumElems*SizeRatio;
32681 unsigned MaskNumElts = VT.getVectorNumElements();
32682 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32685 unsigned NumConcat = WidenNumElts / MaskNumElts;
32686 SmallVector<SDValue, 16> Ops(NumConcat);
32687 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32689 for (unsigned i = 1; i != NumConcat; ++i)
32692 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
// The final store is no longer truncating (last arg 'false').
32695 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
32696 Mst->getBasePtr(), NewMask, StVT,
32697 Mst->getMemOperand(), false);
32700 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
32701 const X86Subtarget &Subtarget) {
32702 StoreSDNode *St = cast<StoreSDNode>(N);
32703 EVT VT = St->getValue().getValueType();
32704 EVT StVT = St->getMemoryVT();
32706 SDValue StoredVal = St->getOperand(1);
32707 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32709 // If we are saving a concatenation of two XMM registers and 32-byte stores
32710 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
32712 unsigned AddressSpace = St->getAddressSpace();
32713 unsigned Alignment = St->getAlignment();
32714 if (VT.is256BitVector() && StVT == VT &&
32715 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
32716 AddressSpace, Alignment, &Fast) &&
32718 unsigned NumElems = VT.getVectorNumElements();
32722 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
32723 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
32725 SDValue Ptr0 = St->getBasePtr();
32726 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
32729 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
32730 Alignment, St->getMemOperand()->getFlags());
32732 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
32733 std::min(16U, Alignment), St->getMemOperand()->getFlags());
32734 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
32737 // Optimize trunc store (of multiple scalars) to shuffle and store.
32738 // First, pack all of the elements in one place. Next, store to memory
32739 // in fewer chunks.
32740 if (St->isTruncatingStore() && VT.isVector()) {
32741 // Check if we can detect an AVG pattern from the truncation. If yes,
32742 // replace the trunc store by a normal store with the result of X86ISD::AVG
32744 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
32746 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
32747 St->getPointerInfo(), St->getAlignment(),
32748 St->getMemOperand()->getFlags());
32751 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
32752 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
32753 dl, Val, St->getBasePtr(),
32754 St->getMemoryVT(), St->getMemOperand(), DAG);
32756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32757 unsigned NumElems = VT.getVectorNumElements();
32758 assert(StVT != VT && "Cannot truncate to the same type");
32759 unsigned FromSz = VT.getScalarSizeInBits();
32760 unsigned ToSz = StVT.getScalarSizeInBits();
32762 // The truncating store is legal in some cases. For example
32763 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
32764 // are designated for truncate store.
32765 // In this case we don't need any further transformations.
32766 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
32769 // From, To sizes and ElemCount must be pow of two
32770 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
32771 // We are going to use the original vector elt for storing.
32772 // Accumulated smaller vector elements must be a multiple of the store size.
32773 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
32775 unsigned SizeRatio = FromSz / ToSz;
32777 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32779 // Create a type on which we perform the shuffle
32780 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32781 StVT.getScalarType(), NumElems*SizeRatio);
32783 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32785 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
32786 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
32787 for (unsigned i = 0; i != NumElems; ++i)
32788 ShuffleVec[i] = i * SizeRatio;
32790 // Can't shuffle using an illegal type.
32791 if (!TLI.isTypeLegal(WideVecVT))
32794 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32795 DAG.getUNDEF(WideVecVT),
32797 // At this point all of the data is stored at the bottom of the
32798 // register. We now need to save it to mem.
32800 // Find the largest store unit
32801 MVT StoreType = MVT::i8;
32802 for (MVT Tp : MVT::integer_valuetypes()) {
32803 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
32807 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
32808 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
32809 (64 <= NumElems * ToSz))
32810 StoreType = MVT::f64;
32812 // Bitcast the original vector into a vector of store-size units
32813 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
32814 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
32815 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
32816 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
32817 SmallVector<SDValue, 8> Chains;
32818 SDValue Ptr = St->getBasePtr();
32820 // Perform one or more big stores into memory.
32821 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32822 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32823 StoreType, ShuffWide,
32824 DAG.getIntPtrConstant(i, dl));
32826 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32827 St->getAlignment(), St->getMemOperand()->getFlags());
32828 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32829 Chains.push_back(Ch);
32832 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
32835 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32836 // the FP state in cases where an emms may be missing.
32837 // A preferable solution to the general problem is to figure out the right
32838 // places to insert EMMS. This qualifies as a quick hack.
32840 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
32841 if (VT.getSizeInBits() != 64)
32844 const Function *F = DAG.getMachineFunction().getFunction();
32845 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
32847 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32848 if ((VT.isVector() ||
32849 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32850 isa<LoadSDNode>(St->getValue()) &&
32851 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32852 St->getChain().hasOneUse() && !St->isVolatile()) {
32853 SDNode* LdVal = St->getValue().getNode();
32854 LoadSDNode *Ld = nullptr;
32855 int TokenFactorIndex = -1;
32856 SmallVector<SDValue, 8> Ops;
32857 SDNode* ChainVal = St->getChain().getNode();
32858 // Must be a store of a load. We currently handle two cases: the load
32859 // is a direct child, and it's under an intervening TokenFactor. It is
32860 // possible to dig deeper under nested TokenFactors.
32861 if (ChainVal == LdVal)
32862 Ld = cast<LoadSDNode>(St->getChain());
32863 else if (St->getValue().hasOneUse() &&
32864 ChainVal->getOpcode() == ISD::TokenFactor) {
32865 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32866 if (ChainVal->getOperand(i).getNode() == LdVal) {
32867 TokenFactorIndex = i;
32868 Ld = cast<LoadSDNode>(St->getValue());
32870 Ops.push_back(ChainVal->getOperand(i));
32874 if (!Ld || !ISD::isNormalLoad(Ld))
32877 // If this is not the MMX case, i.e. we are just turning i64 load/store
32878 // into f64 load/store, avoid the transformation if there are multiple
32879 // uses of the loaded value.
32880 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
32885 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
32886 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
32888 if (Subtarget.is64Bit() || F64IsLegal) {
32889 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32890 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32891 Ld->getPointerInfo(), Ld->getAlignment(),
32892 Ld->getMemOperand()->getFlags());
32893 SDValue NewChain = NewLd.getValue(1);
32894 if (TokenFactorIndex >= 0) {
32895 Ops.push_back(NewChain);
32896 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32898 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
32899 St->getPointerInfo(), St->getAlignment(),
32900 St->getMemOperand()->getFlags());
32903 // Otherwise, lower to two pairs of 32-bit loads / stores.
32904 SDValue LoAddr = Ld->getBasePtr();
32905 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32907 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32908 Ld->getPointerInfo(), Ld->getAlignment(),
32909 Ld->getMemOperand()->getFlags());
32910 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32911 Ld->getPointerInfo().getWithOffset(4),
32912 MinAlign(Ld->getAlignment(), 4),
32913 Ld->getMemOperand()->getFlags());
32915 SDValue NewChain = LoLd.getValue(1);
32916 if (TokenFactorIndex >= 0) {
32917 Ops.push_back(LoLd);
32918 Ops.push_back(HiLd);
32919 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32922 LoAddr = St->getBasePtr();
32923 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
32926 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
32927 St->getAlignment(), St->getMemOperand()->getFlags());
32928 SDValue HiSt = DAG.getStore(
32929 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
32930 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
32931 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
32934 // This is similar to the above case, but here we handle a scalar 64-bit
32935 // integer store that is extracted from a vector on a 32-bit target.
32936 // If we have SSE2, then we can treat it like a floating-point double
32937 // to get past legalization. The execution dependencies fixup pass will
32938 // choose the optimal machine instruction for the store if this really is
32939 // an integer or v2f32 rather than an f64.
32940 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32941 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32942 SDValue OldExtract = St->getOperand(1);
32943 SDValue ExtOp0 = OldExtract.getOperand(0);
32944 unsigned VecSize = ExtOp0.getValueSizeInBits();
32945 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
32946 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
32947 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
32948 BitCast, OldExtract.getOperand(1));
32949 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
32950 St->getPointerInfo(), St->getAlignment(),
32951 St->getMemOperand()->getFlags());
32957 /// Return 'true' if this vector operation is "horizontal"
32958 /// and return the operands for the horizontal operation in LHS and RHS. A
32959 /// horizontal operation performs the binary operation on successive elements
32960 /// of its first operand, then on successive elements of its second operand,
32961 /// returning the resulting values in a vector. For example, if
32962 /// A = < float a0, float a1, float a2, float a3 >
32964 /// B = < float b0, float b1, float b2, float b3 >
32965 /// then the result of doing a horizontal operation on A and B is
32966 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
32967 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
32968 /// A horizontal-op B, for some already available A and B, and if so then LHS is
32969 /// set to A, RHS to B, and the routine returns 'true'.
32970 /// Note that the binary operation should have the property that if one of the
32971 /// operands is UNDEF then the result is UNDEF.
32972 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
// NOTE(review): this extraction is lossy -- the embedded original line numbers
// skip, and dropped lines include local declarations, else-arms, early returns
// and closing braces. The notes below flag the apparent gaps; confirm each
// against upstream LLVM before editing.
32973 // Look for the following pattern: if
32974 // A = < float a0, float a1, float a2, float a3 >
32975 // B = < float b0, float b1, float b2, float b3 >
32977 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
32978 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
32979 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
32980 // which is A horizontal-op B.
32982 // At least one of the operands should be a vector shuffle.
32983 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
32984 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
// NOTE(review): a `return false;` presumably followed this guard -- dropped.
32987 MVT VT = LHS.getSimpleValueType();
32989 assert((VT.is128BitVector() || VT.is256BitVector()) &&
32990 "Unsupported vector type for horizontal add/sub");
32992 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
32993 // operate independently on 128-bit lanes.
32994 unsigned NumElts = VT.getVectorNumElements();
32995 unsigned NumLanes = VT.getSizeInBits()/128;
32996 unsigned NumLaneElts = NumElts / NumLanes;
32997 assert((NumLaneElts % 2 == 0) &&
32998 "Vector type should have an even number of elements in each lane");
32999 unsigned HalfLaneElts = NumLaneElts/2;
33001 // View LHS in the form
33002 // LHS = VECTOR_SHUFFLE A, B, LMask
33003 // If LHS is not a shuffle then pretend it is the shuffle
33004 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33005 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
// NOTE(review): the declarations `SDValue A, B;` appear to have been dropped
// just before the mask vector -- A and B are assigned below.
33008 SmallVector<int, 16> LMask(NumElts);
33009 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33010 if (!LHS.getOperand(0).isUndef())
33011 A = LHS.getOperand(0);
33012 if (!LHS.getOperand(1).isUndef())
33013 B = LHS.getOperand(1);
33014 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33015 std::copy(Mask.begin(), Mask.end(), LMask.begin());
// NOTE(review): the `} else {` arm (A = LHS; identity mask fill) appears to
// have been dropped around here -- confirm against upstream.
33017 if (!LHS.isUndef())
33019 for (unsigned i = 0; i != NumElts; ++i)
33023 // Likewise, view RHS in the form
33024 // RHS = VECTOR_SHUFFLE C, D, RMask
// NOTE(review): `SDValue C, D;` presumably dropped here as well.
33026 SmallVector<int, 16> RMask(NumElts);
33027 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33028 if (!RHS.getOperand(0).isUndef())
33029 C = RHS.getOperand(0);
33030 if (!RHS.getOperand(1).isUndef())
33031 D = RHS.getOperand(1);
33032 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33033 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33035 if (!RHS.isUndef())
33037 for (unsigned i = 0; i != NumElts; ++i)
33041 // Check that the shuffles are both shuffling the same vectors.
33042 if (!(A == C && B == D) && !(A == D && B == C))
33045 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33046 if (!A.getNode() && !B.getNode())
33049 // If A and B occur in reverse order in RHS, then "swap" them (which means
33050 // rewriting the mask).
// NOTE(review): the guard (presumably `if (A != C)`) before commuting the
// mask appears to have been dropped.
33052 ShuffleVectorSDNode::commuteMask(RMask);
33054 // At this point LHS and RHS are equivalent to
33055 // LHS = VECTOR_SHUFFLE A, B, LMask
33056 // RHS = VECTOR_SHUFFLE A, B, RMask
33057 // Check that the masks correspond to performing a horizontal operation.
33058 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33059 for (unsigned i = 0; i != NumLaneElts; ++i) {
33060 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33062 // Ignore any UNDEF components.
33063 if (LIdx < 0 || RIdx < 0 ||
33064 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33065 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33068 // Check that successive elements are being operated on. If not, this is
33069 // not a horizontal operation.
33070 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33071 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33072 if (!(LIdx == Index && RIdx == Index + 1) &&
33073 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
// NOTE(review): a `return false;` plus the two loop-closing braces appear to
// have been dropped between here and the final assignments.
33078 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33079 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
// NOTE(review): trailing `return true;` and the function's closing brace
// appear to have been dropped from this extraction.
33083 /// Do target-specific dag combines on floating-point adds/subs.
33084 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33085 const X86Subtarget &Subtarget) {
33086 EVT VT = N->getValueType(0);
33087 SDValue LHS = N->getOperand(0);
33088 SDValue RHS = N->getOperand(1);
33089 bool IsFadd = N->getOpcode() == ISD::FADD;
33090 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33092 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33093 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33094 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33095 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33096 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33097 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33102 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33104 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33105 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33106 const X86Subtarget &Subtarget,
// NOTE(review): the trailing parameter (presumably `SDLoc &DL)`) was dropped
// by this extraction -- DL is used in TruncateArithmetic below.
33108 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33109 SDValue Src = N->getOperand(0);
33110 unsigned Opcode = Src.getOpcode();
33111 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33113 EVT VT = N->getValueType(0);
33114 EVT SrcVT = Src.getValueType();
// Predicate: is pre-truncating Src's operands no worse than one truncation?
33116 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33117 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33119 // Repeated operand, so we are only trading one output truncation for
33120 // one input truncation.
// NOTE(review): the `if (Op0 == Op1) return true;` body appears dropped here.
33124 // See if either operand has been extended from a smaller/equal size to
33125 // the truncation size, allowing a truncation to combine with the extend.
33126 unsigned Opcode0 = Op0.getOpcode();
33127 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33128 Opcode0 == ISD::ZERO_EXTEND) &&
33129 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
// NOTE(review): `return true;` presumably dropped after this condition.
33132 unsigned Opcode1 = Op1.getOpcode();
33133 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33134 Opcode1 == ISD::ZERO_EXTEND) &&
33135 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33138 // See if either operand is a single use constant which can be constant
33140 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33141 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33142 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33143 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
// Helper: build TRUNC(N0) op TRUNC(N1) at the narrow type VT.
33146 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33147 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33148 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33149 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33152 // Don't combine if the operation has other uses.
33153 if (!N->isOnlyUserOf(Src.getNode()))
// NOTE(review): `return SDValue();` presumably dropped after this guard.
33156 // Only support vector truncation for now.
33157 // TODO: i64 scalar math would benefit as well.
33158 if (!VT.isVector())
33161 // In most cases its only worth pre-truncating if we're only facing the cost
33162 // of one truncation.
33163 // i.e. if one of the inputs will constant fold or the input is repeated.
// NOTE(review): a `switch (Opcode)` with case labels (bitwise ops here,
// MUL/ADD below) appears to have been dropped -- the duplicate Op0/Op1
// declarations at 33184/33185 only compile inside separate case scopes.
33168 SDValue Op0 = Src.getOperand(0);
33169 SDValue Op1 = Src.getOperand(1);
33170 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33171 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33172 return TruncateArithmetic(Op0, Op1);
33177 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
33178 // better to truncate if we have the chance.
33179 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33180 !TLI.isOperationLegal(Opcode, SrcVT))
33181 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33184 SDValue Op0 = Src.getOperand(0);
33185 SDValue Op1 = Src.getOperand(1);
33186 if (TLI.isOperationLegal(Opcode, VT) &&
33187 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33188 return TruncateArithmetic(Op0, Op1);
// NOTE(review): closing braces and the final `return SDValue();` appear to
// have been dropped from this extraction.
33196 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33198 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33199 SmallVector<SDValue, 8> &Regs) {
33200 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33201 Regs[0].getValueType() == MVT::v2i64));
33202 EVT OutVT = N->getValueType(0);
33203 EVT OutSVT = OutVT.getVectorElementType();
33204 EVT InVT = Regs[0].getValueType();
33205 EVT InSVT = InVT.getVectorElementType();
33208 // First, use mask to unset all bits that won't appear in the result.
33209 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33210 "OutSVT can only be either i8 or i16.");
33212 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33213 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33214 for (auto &Reg : Regs)
33215 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33217 MVT UnpackedVT, PackedVT;
33218 if (OutSVT == MVT::i8) {
33219 UnpackedVT = MVT::v8i16;
33220 PackedVT = MVT::v16i8;
33222 UnpackedVT = MVT::v4i32;
33223 PackedVT = MVT::v8i16;
33226 // In each iteration, truncate the type by a half size.
33227 auto RegNum = Regs.size();
33228 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33229 j < e; j *= 2, RegNum /= 2) {
33230 for (unsigned i = 0; i < RegNum; i++)
33231 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33232 for (unsigned i = 0; i < RegNum / 2; i++)
33233 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33237 // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
33238 // then extract a subvector as the result since v8i8 is not a legal type.
33239 if (OutVT == MVT::v8i8) {
33240 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33241 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33242 DAG.getIntPtrConstant(0, DL));
33244 } else if (RegNum > 1) {
33245 Regs.resize(RegNum);
33246 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33251 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33253 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33255 SmallVector<SDValue, 8> &Regs) {
33256 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33257 EVT OutVT = N->getValueType(0);
33260 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
33261 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33262 for (auto &Reg : Regs) {
33263 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33265 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33269 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33270 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33273 if (Regs.size() > 2) {
33274 Regs.resize(Regs.size() / 2);
33275 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33280 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33281 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33282 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33283 /// element that is extracted from a vector and then truncated, and it is
33284 /// difficult to do this optimization based on them.
33285 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33286 const X86Subtarget &Subtarget) {
33287 EVT OutVT = N->getValueType(0);
33288 if (!OutVT.isVector())
33291 SDValue In = N->getOperand(0);
33292 if (!In.getValueType().isSimple())
33295 EVT InVT = In.getValueType();
33296 unsigned NumElems = OutVT.getVectorNumElements();
33298 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33299 // SSE2, and we need to take care of it specially.
33300 // AVX512 provides vpmovdb.
33301 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33304 EVT OutSVT = OutVT.getVectorElementType();
33305 EVT InSVT = InVT.getVectorElementType();
33306 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33307 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33311 // SSSE3's pshufb results in less instructions in the cases below.
33312 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33313 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33314 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33319 // Split a long vector into vectors of legal type.
33320 unsigned RegNum = InVT.getSizeInBits() / 128;
33321 SmallVector<SDValue, 8> SubVec(RegNum);
33322 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33323 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33325 for (unsigned i = 0; i < RegNum; i++)
33326 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33327 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33329 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33330 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33331 // truncate 2 x v4i32 to v8i16.
33332 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33333 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33334 else if (InSVT == MVT::i32)
33335 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33340 /// This function transforms vector truncation of 'all or none' bits values.
33341 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
33342 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33344 const X86Subtarget &Subtarget) {
33345 // Requires SSE2 but AVX512 has fast truncate.
33346 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33349 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33352 SDValue In = N->getOperand(0);
33353 if (!In.getValueType().isSimple())
33356 MVT VT = N->getValueType(0).getSimpleVT();
33357 MVT SVT = VT.getScalarType();
33359 MVT InVT = In.getValueType().getSimpleVT();
33360 MVT InSVT = InVT.getScalarType();
33362 // Use PACKSS if the input is a splatted sign bit.
33363 // e.g. Comparison result, sext_in_reg, etc.
33364 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33365 if (NumSignBits != InSVT.getSizeInBits())
33368 // Check we have a truncation suited for PACKSS.
33369 if (!VT.is128BitVector() && !VT.is256BitVector())
33371 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33373 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33376 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33379 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33380 const X86Subtarget &Subtarget) {
33381 EVT VT = N->getValueType(0);
33382 SDValue Src = N->getOperand(0);
33385 // Attempt to pre-truncate inputs to arithmetic ops instead.
33386 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33389 // Try to detect AVG pattern first.
33390 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33393 // Try to combine truncation with unsigned saturation.
33394 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33397 // The bitcast source is a direct mmx result.
33398 // Detect bitcasts between i32 to x86mmx
33399 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33400 SDValue BCSrc = Src.getOperand(0);
33401 if (BCSrc.getValueType() == MVT::x86mmx)
33402 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33405 // Try to truncate extended sign bits with PACKSS.
33406 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33409 return combineVectorTruncation(N, DAG, Subtarget);
33412 /// Returns the negated value if the node \p N flips sign of FP value.
33414 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33415 /// AVX512F does not have FXOR, so FNEG is lowered as
33416 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33417 /// In this case we go though all bitcasts.
33418 static SDValue isFNEG(SDNode *N) {
33419 if (N->getOpcode() == ISD::FNEG)
33420 return N->getOperand(0);
33422 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33423 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33426 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33427 if (!Op1.getValueType().isFloatingPoint())
33430 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33432 unsigned EltBits = Op1.getScalarValueSizeInBits();
33433 auto isSignMask = [&](const ConstantFP *C) {
33434 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33437 // There is more than one way to represent the same constant on
33438 // the different X86 targets. The type of the node may also depend on size.
33439 // - load scalar value and broadcast
33440 // - BUILD_VECTOR node
33441 // - load from a constant pool.
33442 // We check all variants here.
33443 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33444 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33445 if (isSignMask(cast<ConstantFP>(C)))
33448 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33449 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33450 if (isSignMask(CN->getConstantFPValue()))
33453 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33454 if (C->getType()->isVectorTy()) {
33455 if (auto *SplatV = C->getSplatValue())
33456 if (isSignMask(cast<ConstantFP>(SplatV)))
33458 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33459 if (isSignMask(FPConst))
33465 /// Do target-specific dag combines on floating point negations.
33466 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33467 const X86Subtarget &Subtarget) {
33468 EVT OrigVT = N->getValueType(0);
33469 SDValue Arg = isFNEG(N);
33470 assert(Arg.getNode() && "N is expected to be an FNEG node");
33472 EVT VT = Arg.getValueType();
33473 EVT SVT = VT.getScalarType();
33476 // Let legalize expand this if it isn't a legal type yet.
33477 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33480 // If we're negating a FMUL node on a target with FMA, then we can avoid the
33481 // use of a constant by performing (-0 - A*B) instead.
33482 // FIXME: Check rounding control flags as well once it becomes available.
33483 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33484 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33485 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33486 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33487 Arg.getOperand(1), Zero);
33488 return DAG.getBitcast(OrigVT, NewNode);
33491 // If we're negating an FMA node, then we can adjust the
33492 // instruction to include the extra negation.
33493 unsigned NewOpcode = 0;
33494 if (Arg.hasOneUse()) {
33495 switch (Arg.getOpcode()) {
33496 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33497 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33498 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33499 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33500 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33501 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33502 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33503 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33504 // We can't handle scalar intrinsic node here because it would only
33505 // invert one element and not the whole vector. But we could try to handle
33506 // a negation of the lower element only.
33510 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
33511 Arg.getNode()->ops()));
33516 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33517 const X86Subtarget &Subtarget) {
33518 MVT VT = N->getSimpleValueType(0);
33519 // If we have integer vector types available, use the integer opcodes.
33520 if (VT.isVector() && Subtarget.hasSSE2()) {
33523 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
33525 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
33526 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
33527 unsigned IntOpcode;
33528 switch (N->getOpcode()) {
33529 default: llvm_unreachable("Unexpected FP logic op");
33530 case X86ISD::FOR: IntOpcode = ISD::OR; break;
33531 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
33532 case X86ISD::FAND: IntOpcode = ISD::AND; break;
33533 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
33535 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
33536 return DAG.getBitcast(VT, IntOp);
33541 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
33542 TargetLowering::DAGCombinerInfo &DCI,
33543 const X86Subtarget &Subtarget) {
33544 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
33547 if (DCI.isBeforeLegalizeOps())
33550 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
33553 if (Subtarget.hasCMov())
33554 if (SDValue RV = combineIntegerAbs(N, DAG))
33557 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33561 return combineFneg(N, DAG, Subtarget);
33566 static bool isNullFPScalarOrVectorConst(SDValue V) {
33567 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
33570 /// If a value is a scalar FP zero or a vector FP zero (potentially including
33571 /// undefined elements), return a zero constant that may be used to fold away
33572 /// that value. In the case of a vector, the returned constant will not contain
33573 /// undefined elements even if the input parameter does. This makes it suitable
33574 /// to be used as a replacement operand with operations (eg, bitwise-and) where
33575 /// an undef should not propagate.
33576 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
33577 const X86Subtarget &Subtarget) {
33578 if (!isNullFPScalarOrVectorConst(V))
33581 if (V.getValueType().isVector())
33582 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
33587 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
33588 const X86Subtarget &Subtarget) {
33589 SDValue N0 = N->getOperand(0);
33590 SDValue N1 = N->getOperand(1);
33591 EVT VT = N->getValueType(0);
33594 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
33595 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
33596 (VT == MVT::f64 && Subtarget.hasSSE2())))
33599 auto isAllOnesConstantFP = [](SDValue V) {
33600 auto *C = dyn_cast<ConstantFPSDNode>(V);
33601 return C && C->getConstantFPValue()->isAllOnesValue();
33604 // fand (fxor X, -1), Y --> fandn X, Y
33605 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
33606 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
33608 // fand X, (fxor Y, -1) --> fandn Y, X
33609 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
33610 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
33615 /// Do target-specific dag combines on X86ISD::FAND nodes.
33616 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
33617 const X86Subtarget &Subtarget) {
33618 // FAND(0.0, x) -> 0.0
33619 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
33622 // FAND(x, 0.0) -> 0.0
33623 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33626 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
33629 return lowerX86FPLogicOp(N, DAG, Subtarget);
33632 /// Do target-specific dag combines on X86ISD::FANDN nodes.
33633 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
33634 const X86Subtarget &Subtarget) {
33635 // FANDN(0.0, x) -> x
33636 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33637 return N->getOperand(1);
33639 // FANDN(x, 0.0) -> 0.0
33640 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33643 return lowerX86FPLogicOp(N, DAG, Subtarget);
33646 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
33647 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
33648 const X86Subtarget &Subtarget) {
33649 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
33651 // F[X]OR(0.0, x) -> x
33652 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33653 return N->getOperand(1);
33655 // F[X]OR(x, 0.0) -> x
33656 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
33657 return N->getOperand(0);
33660 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
33663 return lowerX86FPLogicOp(N, DAG, Subtarget);
33666 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
33667 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
33668 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
33670 // Only perform optimizations if UnsafeMath is used.
33671 if (!DAG.getTarget().Options.UnsafeFPMath)
33674 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
33675 // into FMINC and FMAXC, which are Commutative operations.
33676 unsigned NewOp = 0;
33677 switch (N->getOpcode()) {
33678 default: llvm_unreachable("unknown opcode");
33679 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
33680 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
33683 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
33684 N->getOperand(0), N->getOperand(1));
// Lower ISD::FMINNUM/FMAXNUM (IEEE-754 minNum/maxNum, which must return the
// non-NaN operand) using the SSE FMIN/FMAX instructions plus a select that
// filters out a NaN in Op0. Returns SDValue() when the type/subtarget
// combination is not handled here.
33687 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
33688 const X86Subtarget &Subtarget) {
33689 if (Subtarget.useSoftFloat())
33692 // TODO: Check for global or instruction-level "nnan". In that case, we
33693 // should be able to lower to FMAX/FMIN alone.
33694 // TODO: If an operand is already known to be a NaN or not a NaN, this
33695 // should be an optional swap and FMAX/FMIN.
// Only scalar/vector f32 and f64 types with native min/max support qualify.
33697 EVT VT = N->getValueType(0);
33698 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
33699 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
33700 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
33703 // This takes at least 3 instructions, so favor a library call when operating
33704 // on a scalar and minimizing code size.
33705 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
33708 SDValue Op0 = N->getOperand(0);
33709 SDValue Op1 = N->getOperand(1);
// SetCCType is the vector-of-i1-or-mask type the target uses for compares.
33711 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
33712 DAG.getDataLayout(), *DAG.getContext(), VT);
33714 // There are 4 possibilities involving NaN inputs, and these are the required
33718 // ----------------
33719 // Num | Max | Op0 |
33720 // Op0 ----------------
33721 // NaN | Op1 | NaN |
33722 // ----------------
33724 // The SSE FP max/min instructions were not designed for this case, but rather
33726 // Min = Op1 < Op0 ? Op1 : Op0
33727 // Max = Op1 > Op0 ? Op1 : Op0
33729 // So they always return Op0 if either input is a NaN. However, we can still
33730 // use those instructions for fmaxnum by selecting away a NaN input.
33732 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
// Note the operand swap: Op1 becomes the first source so Op0 is the
// NaN-passthrough operand of the hardware min/max.
33733 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
33734 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
// SETUO(x, x) is true exactly when x is a NaN (unordered self-compare).
33735 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
33737 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
33738 // are NaN, the NaN value of Op1 is the result.
33739 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
33740 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
33743 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
// ANDNP computes (~Op0 & Op1); fold away the two trivial constant cases and
// then try to merge a bitmask ANDNP into a surrounding shuffle chain.
33744 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
33745 TargetLowering::DAGCombinerInfo &DCI,
33746 const X86Subtarget &Subtarget) {
33747 // ANDNP(0, x) -> x
33748 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
33749 return N->getOperand(1);
33751 // ANDNP(x, 0) -> 0
33752 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
33753 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
33755 EVT VT = N->getValueType(0);
33757 // Attempt to recursively combine a bitmask ANDNP with shuffles.
// Only byte-aligned element sizes can be expressed as a byte shuffle mask.
33758 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33760 SmallVector<int, 1> NonceMask; // Just a placeholder.
33761 NonceMask.push_back(0);
33762 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
33763 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
33765 return SDValue(); // This routine will use CombineTo to replace N.
// X86ISD::BT only consumes log2(width) low bits of the bit-index operand, so
// demand only those bits; this lets SimplifyDemandedBits strip masks/zexts
// that feed the index.
33771 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
33772 TargetLowering::DAGCombinerInfo &DCI) {
33773 // BT ignores high bits in the bit index operand.
33774 SDValue Op1 = N->getOperand(1);
// One-use check: simplifying a shared operand could pessimize other users.
33775 if (Op1.hasOneUse()) {
33776 unsigned BitWidth = Op1.getValueSizeInBits();
33777 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
33778 APInt KnownZero, KnownOne;
33779 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
33780 !DCI.isBeforeLegalizeOps());
33781 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If either routine simplified Op1, commit the replacement in the combiner.
33782 if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
33783 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
33784 DCI.CommitTargetLoweringOpt(TLO);
// Combine SIGN_EXTEND_INREG on vectors. The interesting case rewrites a
// v4i64 sext_in_reg of an (any|sign)-extended v4i32 value into a cheap
// v4i32 sext_in_reg followed by a v4i32->v4i64 sign extend, avoiding the
// missing 64-bit vector arithmetic shift right.
33789 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
33790 const X86Subtarget &Subtarget) {
33791 EVT VT = N->getValueType(0);
33792 if (!VT.isVector())
33795 SDValue N0 = N->getOperand(0);
33796 SDValue N1 = N->getOperand(1);
// N1 is the VT node describing the width being sign-extended from.
33797 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
33800 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
33801 // both SSE and AVX2 since there is no sign-extended shift right
33802 // operation on a vector with 64-bit elements.
33803 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
33804 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
33805 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
33806 N0.getOpcode() == ISD::SIGN_EXTEND)) {
33807 SDValue N00 = N0.getOperand(0);
33809 // EXTLOAD has a better solution on AVX2,
33810 // it may be replaced with X86ISD::VSEXT node.
// Bail (fall through) for extending loads on AVX2 so VSEXT can be formed.
33811 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
33812 if (!ISD::isNormalLoad(N00.getNode()))
33815 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
33816 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
33818 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
33824 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
33825 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
33826 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
33827 /// opportunities to combine math ops, use an LEA, or use a complex addressing
33828 /// mode. This can eliminate extend, add, and shift instructions.
33829 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
33830 const X86Subtarget &Subtarget) {
33831 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
33832 Ext->getOpcode() != ISD::ZERO_EXTEND)
33835 // TODO: This should be valid for other integer types.
33836 EVT VT = Ext->getValueType(0);
33837 if (VT != MVT::i64)
33840 SDValue Add = Ext->getOperand(0);
33841 if (Add.getOpcode() != ISD::ADD)
// The no-wrap flag must match the extension kind for the transform to be
// value-preserving: nsw justifies moving a sext, nuw justifies a zext.
33844 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
33845 bool NSW = Add->getFlags()->hasNoSignedWrap();
33846 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
33848 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
33850 if ((Sext && !NSW) || (!Sext && !NUW))
33853 // Having a constant operand to the 'add' ensures that we are not increasing
33854 // the instruction count because the constant is extended for free below.
33855 // A constant operand can also become the displacement field of an LEA.
33856 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
33860 // Don't make the 'add' bigger if there's no hope of combining it with some
33861 // other 'add' or 'shl' instruction.
33862 // TODO: It may be profitable to generate simpler LEA instructions in place
33863 // of single 'add' instructions, but the cost model for selecting an LEA
33864 // currently has a high threshold.
33865 bool HasLEAPotential = false;
33866 for (auto *User : Ext->uses()) {
33867 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
33868 HasLEAPotential = true;
33872 if (!HasLEAPotential)
33875 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
// The constant is re-extended to i64 in the matching direction (sext/zext).
33876 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
33877 SDValue AddOp0 = Add.getOperand(0);
33878 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
33879 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
33881 // The wider add is guaranteed to not wrap because both operands are
// Carry the original no-wrap flags onto the widened add.
33884 Flags.setNoSignedWrap(NSW);
33885 Flags.setNoUnsignedWrap(NUW);
33886 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
33889 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
33890 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
33891 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
33892 /// extends from AH (which we otherwise need to do contortions to access).
33893 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
33894 SDValue N0 = N->getOperand(0);
33895 auto OpcodeN = N->getOpcode();
33896 auto OpcodeN0 = N0.getOpcode();
// Only sext-of-sdivrem / zext-of-udivrem pairings are handled.
33897 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
33898 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
// The extend must be of the remainder (result #1), i8 -> i32 only.
33901 EVT VT = N->getValueType(0);
33902 EVT InVT = N0.getValueType();
33903 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
// New node produces (i8 quotient, i32 extended remainder-from-AH).
33906 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
33907 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
33908 : X86ISD::UDIVREM8_ZEXT_HREG;
33909 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
// Re-point users of the old quotient at the new node, return new remainder.
33911 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
33912 return R.getValue(1);
33915 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
33916 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
33917 /// with UNDEFs) of the input to vectors of the same size as the target type
33918 /// which then extends the lowest elements.
33919 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
33920 TargetLowering::DAGCombinerInfo &DCI,
33921 const X86Subtarget &Subtarget) {
33922 unsigned Opcode = N->getOpcode();
33923 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
33925 if (!DCI.isBeforeLegalizeOps())
33927 if (!Subtarget.hasSSE2())
33930 SDValue N0 = N->getOperand(0);
33931 EVT VT = N->getValueType(0);
33932 EVT SVT = VT.getScalarType();
33933 EVT InVT = N0.getValueType();
33934 EVT InSVT = InVT.getScalarType();
33936 // Input type must be a vector and we must be extending legal integer types.
33937 if (!VT.isVector())
33939 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
33941 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
33944 // On AVX2+ targets, if the input/output types are both legal then we will be
33945 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
33946 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
33947 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
// Helper: widen N to a vector of 'Size' bits by concatenating with UNDEFs;
// only the low (original) elements carry meaningful data afterwards.
33952 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
33953 EVT InVT = N.getValueType();
33954 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
33955 Size / InVT.getScalarSizeInBits());
33956 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
33957 DAG.getUNDEF(InVT));
// First operand is N itself; remaining slots stay UNDEF.
33959 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
33962 // If target-size is less than 128-bits, extend to a type that would extend
33963 // to 128 bits, extend that and extract the original target vector.
33964 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
33965 unsigned Scale = 128 / VT.getSizeInBits();
33967 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
33968 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
33969 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
// Pull the low 'VT'-sized subvector back out of the widened extend.
33970 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
33971 DAG.getIntPtrConstant(0, DL));
33974 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
33975 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
33976 // Also use this if we don't have SSE41 to allow the legalizer do its job.
33977 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
33978 (VT.is256BitVector() && Subtarget.hasInt256()) ||
33979 (VT.is512BitVector() && Subtarget.hasAVX512())) {
33980 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
33981 return Opcode == ISD::SIGN_EXTEND
33982 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
33983 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
// Helper: split the extend into 'SplitSize'-bit chunks, extend each in-reg,
// and concatenate the results back to VT.
33986 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
33987 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
33988 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
33989 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
33990 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
33992 SmallVector<SDValue, 8> Opnds;
33993 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
33994 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
33995 DAG.getIntPtrConstant(Offset, DL));
33996 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
33997 SrcVec = Opcode == ISD::SIGN_EXTEND
33998 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
33999 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34000 Opnds.push_back(SrcVec);
34002 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34005 // On pre-AVX2 targets, split into 128-bit nodes of
34006 // ISD::*_EXTEND_VECTOR_INREG.
34007 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34008 return SplitAndExtendInReg(128);
34010 // On pre-AVX512 targets, split into 256-bit nodes of
34011 // ISD::*_EXTEND_VECTOR_INREG.
34012 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34013 return SplitAndExtendInReg(256);
// Target combine for ISD::SIGN_EXTEND: divrem8 folding, i1 sext via select,
// the "sext (xor Bool, -1)" trick, vector-in-reg extension, mask-arithmetic
// widening, and promoting the extension above a no-wrap add.
34018 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34019 TargetLowering::DAGCombinerInfo &DCI,
34020 const X86Subtarget &Subtarget) {
34021 SDValue N0 = N->getOperand(0);
34022 EVT VT = N->getValueType(0);
34023 EVT InVT = N0.getValueType();
34026 if (SDValue DivRem8 = getDivRem8(N, DAG))
34029 if (!DCI.isBeforeLegalizeOps()) {
// After legalize-ops, lower sext(i1) as select(b, -1, 0) directly.
34030 if (InVT == MVT::i1) {
34031 SDValue Zero = DAG.getConstant(0, DL, VT);
34032 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34033 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
34038 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34039 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34040 // Invert and sign-extend a boolean is the same as zero-extend and subtract
34041 // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34042 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34043 // sext (xor Bool, -1) --> sub (zext Bool), 1
34044 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34045 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34048 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34051 if (Subtarget.hasAVX() && VT.is256BitVector())
34052 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34055 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
// Fold FNEG operands into FMA-family nodes: fma(-a, b, c) and friends become
// the corresponding FMSUB/FNMADD/FNMSUB opcode (including rounding and
// scalar-intrinsic variants), removing the explicit negations.
34061 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34062 const X86Subtarget &Subtarget) {
34064 EVT VT = N->getValueType(0);
34066 // Let legalize expand this if it isn't a legal type yet.
34067 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34070 EVT ScalarVT = VT.getScalarType();
34071 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34074 SDValue A = N->getOperand(0);
34075 SDValue B = N->getOperand(1);
34076 SDValue C = N->getOperand(2);
// If V is an FNEG, strip the negation in place and report that we did.
34078 auto invertIfNegative = [](SDValue &V) {
34079 if (SDValue NegVal = isFNEG(V.getNode())) {
34086 // Do not convert the passthru input of scalar intrinsics.
34087 // FIXME: We could allow negations of the lower element only.
// FMADDS1_RND passes A through; FMADDS3_RND passes C through — skip those.
34088 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34089 bool NegB = invertIfNegative(B);
34090 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34092 // Negative multiplication when NegA xor NegB
34093 bool NegMul = (NegA != NegB);
// Base opcode from the (NegMul, NegC) pair; rounding/scalar variants follow.
34095 unsigned NewOpcode;
34097 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34099 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34102 if (N->getOpcode() == X86ISD::FMADD_RND) {
34103 switch (NewOpcode) {
34104 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34105 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34106 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34107 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34109 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34110 switch (NewOpcode) {
34111 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34112 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34113 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34114 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34116 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34117 switch (NewOpcode) {
34118 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34119 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34120 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34121 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
// Non-rounding forms take 3 operands; rounding forms carry operand 3.
34124 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34125 "Unexpected opcode!");
34126 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34129 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
// Target combine for ISD::ZERO_EXTEND: eliminate zext of SETCC_CARRY
// (through an AND-with-1 or a truncate), then try the shared vector-in-reg /
// mask-widening / divrem8 / promote-over-add / ctlz-srl combines.
34132 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34133 TargetLowering::DAGCombinerInfo &DCI,
34134 const X86Subtarget &Subtarget) {
34135 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34136 // (and (i32 x86isd::setcc_carry), 1)
34137 // This eliminates the zext. This transformation is necessary because
34138 // ISD::SETCC is always legalized to i8.
34140 SDValue N0 = N->getOperand(0);
34141 EVT VT = N->getValueType(0);
34143 if (N0.getOpcode() == ISD::AND &&
34145 N0.getOperand(0).hasOneUse()) {
34146 SDValue N00 = N0.getOperand(0);
34147 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
// Only (and x, 1) qualifies; other masks fall through.
34148 if (!isOneConstant(N0.getOperand(1)))
// Re-emit SETCC_CARRY directly at the wider type and mask to bit 0.
34150 return DAG.getNode(ISD::AND, dl, VT,
34151 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34152 N00.getOperand(0), N00.getOperand(1)),
34153 DAG.getConstant(1, dl, VT));
// Same fold when the SETCC_CARRY reaches us through a truncate.
34157 if (N0.getOpcode() == ISD::TRUNCATE &&
34159 N0.getOperand(0).hasOneUse()) {
34160 SDValue N00 = N0.getOperand(0);
34161 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34162 return DAG.getNode(ISD::AND, dl, VT,
34163 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34164 N00.getOperand(0), N00.getOperand(1)),
34165 DAG.getConstant(1, dl, VT));
34169 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34172 if (VT.is256BitVector())
34173 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34176 if (SDValue DivRem8 = getDivRem8(N, DAG))
34179 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34182 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34188 /// Try to map a 128-bit or larger integer comparison to vector instructions
34189 /// before type legalization splits it up into chunks.
34190 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34191 const X86Subtarget &Subtarget) {
34192 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34193 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34195 // We're looking for an oversized integer equality comparison, but ignore a
34196 // comparison with zero because that gets special treatment in EmitTest().
34197 SDValue X = SetCC->getOperand(0);
34198 SDValue Y = SetCC->getOperand(1);
34199 EVT OpVT = X.getValueType();
34200 unsigned OpSize = OpVT.getSizeInBits();
34201 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34204 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34205 // TODO: Add support for AVX-512.
34206 EVT VT = SetCC->getValueType(0);
// i128 needs SSE2 (PCMPEQB/PMOVMSKB); i256 needs the AVX2 256-bit forms.
34208 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34209 (OpSize == 256 && Subtarget.hasAVX2())) {
34210 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34211 SDValue VecX = DAG.getBitcast(VecVT, X);
34212 SDValue VecY = DAG.getBitcast(VecVT, Y);
34214 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34215 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34216 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34217 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34218 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34219 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34220 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34221 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34223 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
// Target combine for ISD::SETCC: canonicalize (0-x) equality compares,
// try wide-integer-to-vector equality, fold compares of sext-from-i1 against
// zero vectors, and lower v4f32 compares early on SSE1-only targets.
34229 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34230 const X86Subtarget &Subtarget) {
34231 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34232 SDValue LHS = N->getOperand(0);
34233 SDValue RHS = N->getOperand(1);
34234 EVT VT = N->getValueType(0);
34237 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34238 EVT OpVT = LHS.getValueType();
34239 // 0-x == y --> x+y == 0
34240 // 0-x != y --> x+y != 0
34241 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34243 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34244 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34246 // x == 0-y --> x+y == 0
34247 // x != 0-y --> x+y != 0
34248 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34250 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34251 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34254 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
// i1-result compares of a sign-extended i1 vector against all-zeros can be
// answered without the compare at all (the sext is 0 or -1 per lane).
34258 if (VT.getScalarType() == MVT::i1 &&
34259 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34261 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34262 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34263 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34265 if (!IsSEXT0 || !IsVZero1) {
34266 // Swap the operands and update the condition code.
34267 std::swap(LHS, RHS);
34268 CC = ISD::getSetCCSwappedOperands(CC);
34270 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34271 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34272 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34275 if (IsSEXT0 && IsVZero1) {
34276 assert(VT == LHS.getOperand(0).getValueType() &&
34277 "Uexpected operand type");
// sext(b) is 0 or -1, so: >0 never, <=0 always, ==0/>=0 is !b, !=0/<0 is b.
34278 if (CC == ISD::SETGT)
34279 return DAG.getConstant(0, DL, VT);
34280 if (CC == ISD::SETLE)
34281 return DAG.getConstant(1, DL, VT);
34282 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34283 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34285 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34286 "Unexpected condition code!");
34287 return LHS.getOperand(0);
34291 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34292 // to avoid scalarization via legalization because v4i32 is not a legal type.
34293 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34294 LHS.getValueType() == MVT::v4f32)
34295 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
// Strip a redundant SIGN_EXTEND_INREG from a gather/scatter mask operand:
// the mask is truncated to v*i1 (k-registers) anyway, so the in-reg extend
// is dead. Updates N in place via UpdateNodeOperands.
34300 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34302 // Gather and Scatter instructions use k-registers for masks. The type of
34303 // the masks is v*i1. So the mask will be truncated anyway.
34304 // The SIGN_EXTEND_INREG my be dropped.
34305 SDValue Mask = N->getOperand(2);
34306 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34307 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34308 NewOps[2] = Mask.getOperand(0);
34309 DAG.UpdateNodeOperands(N, NewOps);
34314 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
// Simplify the flag-producing chain (and possibly the condition code) and
// rebuild the SETCC on the simplified flags.
34315 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34316 const X86Subtarget &Subtarget) {
34318 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34319 SDValue EFLAGS = N->getOperand(1);
34321 // Try to simplify the EFLAGS and condition code operands.
// combineSetCCEFLAGS may update CC (taken by reference) as well.
34322 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34323 return getSETCC(CC, Flags, DL, DAG);
34328 /// Optimize branch condition evaluation.
// BRCOND operands: 0 = chain, 1 = destination BB, 2 = cond code, 3 = EFLAGS.
34329 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34330 const X86Subtarget &Subtarget) {
34332 SDValue EFLAGS = N->getOperand(3);
34333 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34335 // Try to simplify the EFLAGS and condition code operands.
34336 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34337 // RAUW them under us.
34338 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
// Rebuild the branch with the (possibly updated) CC and simplified flags.
34339 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34340 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34341 N->getOperand(1), Cond, Flags);
// Hoist a unary op over AND(vector-setcc, constant): since each compare lane
// is all-zeros or all-ones, UNARYOP(AND(cmp, C)) == AND(cmp, UNARYOP(C)),
// and UNARYOP(C) folds at compile time.
34347 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34348 SelectionDAG &DAG) {
34349 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34350 // optimize away operation when it's from a constant.
34352 // The general transformation is:
34353 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34354 // AND(VECTOR_CMP(x,y), constant2)
34355 // constant2 = UNARYOP(constant)
34357 // Early exit if this isn't a vector operation, the operand of the
34358 // unary operation isn't a bitwise AND, or if the sizes of the operations
34359 // aren't the same.
34360 EVT VT = N->getValueType(0);
34361 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34362 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34363 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34366 // Now check that the other operand of the AND is a constant. We could
34367 // make the transformation for non-constant splats as well, but it's unclear
34368 // that would be a benefit as it would not eliminate any operations, just
34369 // perform one more step in scalar code before moving to the vector unit.
34370 if (BuildVectorSDNode *BV =
34371 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34372 // Bail out if the vector isn't a constant.
34373 if (!BV->isConstant())
34376 // Everything checks out. Build up the new and improved node.
34378 EVT IntVT = BV->getValueType(0);
34379 // Create a new constant of the appropriate type for the transformed
// Apply the unary op to the constant; this will constant-fold.
34381 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34382 // The AND node needs bitcasts to/from an integer vector type around it.
34383 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34384 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34385 N->getOperand(0)->getOperand(0), MaskConst);
34386 SDValue Res = DAG.getBitcast(VT, NewAnd);
// Combine ISD::UINT_TO_FP: promote narrow vector integer sources to i32
// (zext never changes the unsigned value), and downgrade to the cheaper
// SINT_TO_FP whenever the sign bit is known zero.
34393 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34394 const X86Subtarget &Subtarget) {
34395 SDValue Op0 = N->getOperand(0);
34396 EVT VT = N->getValueType(0);
34397 EVT InVT = Op0.getValueType();
34398 EVT InSVT = InVT.getScalarType();
34399 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34401 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34402 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34403 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34405 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34406 InVT.getVectorNumElements());
34407 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// Keep UINT_TO_FP only when legal for the widened type; otherwise the
// zero-extended value fits in i32's positive range, so signed is exact.
34409 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34410 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34412 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34415 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34416 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34417 // the optimization here.
34418 if (DAG.SignBitIsZero(Op0))
34419 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
// Combine ISD::SINT_TO_FP: fold through compare-and-mask patterns, promote
// narrow vector sources to i32, truncate over-wide sources whose sign bits
// are redundant, and use x87 FILD for i64 loads on 32-bit targets.
34424 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34425 const X86Subtarget &Subtarget) {
34426 // First try to optimize away the conversion entirely when it's
34427 // conditionally from a constant. Vectors only.
34428 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34431 // Now move on to more general possibilities.
34432 SDValue Op0 = N->getOperand(0);
34433 EVT VT = N->getValueType(0);
34434 EVT InVT = Op0.getValueType();
34435 EVT InSVT = InVT.getScalarType();
34437 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34438 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34439 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34440 if (InVT.isVector() &&
34441 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34442 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34444 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34445 InVT.getVectorNumElements());
34446 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34447 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34450 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34451 // vectors and scalars, see if we know that the upper bits are all the sign
34452 // bit, in which case we can truncate the input to i32 and convert from that.
34453 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34454 unsigned BitWidth = InVT.getScalarSizeInBits();
34455 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// >= BitWidth-31 sign bits means the value fits in a signed i32 exactly.
34456 if (NumSignBits >= (BitWidth - 31)) {
34457 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34458 if (InVT.isVector())
34459 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34460 InVT.getVectorNumElements());
34462 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34463 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34467 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34468 // a 32-bit target where SSE doesn't support i64->FP operations.
34469 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34470 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34471 EVT LdVT = Ld->getValueType(0);
34473 // This transformation is not supported if the result type is f16 or f128.
34474 if (VT == MVT::f16 || VT == MVT::f128)
// FILD loads and converts in one step; also splice its chain result in
// place of the original load's chain so ordering is preserved.
34477 if (!Ld->isVolatile() && !VT.isVector() &&
34478 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34479 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34480 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34481 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34482 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34489 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
// ADC of two zeros is just the incoming carry bit, so strength-reduce to
// SETCC_CARRY & 1 — but only when the node's own EFLAGS result is unused,
// since we cannot substitute a flag output.
34490 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
34491 X86TargetLowering::DAGCombinerInfo &DCI) {
34492 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
34493 // the result is either zero or one (depending on the input carry bit).
34494 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
34495 if (X86::isZeroNode(N->getOperand(0)) &&
34496 X86::isZeroNode(N->getOperand(1)) &&
34497 // We don't have a good way to replace an EFLAGS use, so only do this when
34499 SDValue(N, 1).use_empty()) {
34501 EVT VT = N->getValueType(0);
// Replacement flag output: a constant 0 standing in for the dead EFLAGS.
34502 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
34503 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
34504 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
34505 DAG.getConstant(X86::COND_B, DL,
34508 DAG.getConstant(1, DL, VT));
// CombineTo replaces both results of N (value and flags) at once.
34509 return DCI.CombineTo(N, Res1, CarryOut);
34515 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
34516 /// which is more useful than 0/1 in some cases.
34517 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
34519 // "Condition code B" is also known as "the carry flag" (CF).
34520 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
34521 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
34522 MVT VT = N->getSimpleValueType(0);
// NOTE(review): the listing elides the branch guard between these two
// returns (an i8 path masking with AND, and an i1 path truncating) —
// confirm against upstream before editing.
34524 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
34526 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
34527 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
34530 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
34531 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
34532 /// with CMP+{ADC, SBB}.
34533 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
34534 bool IsSub = N->getOpcode() == ISD::SUB;
34535 SDValue X = N->getOperand(0);
34536 SDValue Y = N->getOperand(1);
34538 // If this is an add, canonicalize a zext operand to the RHS.
34539 // TODO: Incomplete? What if both sides are zexts?
34540 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
34541 Y.getOpcode() != ISD::ZERO_EXTEND)
34544 // Look through a one-use zext.
34545 bool PeekedThroughZext = false;
34546 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
34547 Y = Y.getOperand(0);
34548 PeekedThroughZext = true;
34551 // If this is an add, canonicalize a setcc operand to the RHS.
34552 // TODO: Incomplete? What if both sides are setcc?
34553 // TODO: Should we allow peeking through a zext of the other operand?
34554 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
34555 Y.getOpcode() != X86ISD::SETCC)
// All transforms below require the RHS (after canonicalization) to be a
// single-use X86ISD::SETCC.
34558 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
34562 EVT VT = N->getValueType(0);
// Operand 0 of X86ISD::SETCC carries the condition code as a constant.
34563 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
34565 if (CC == X86::COND_B) {
34566 // X + SETB Z --> X + (mask SBB Z, Z)
34567 // X - SETB Z --> X - (mask SBB Z, Z)
34568 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
34569 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
// Match the materialized SBB value to the add/sub result width before use.
34570 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34571 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34572 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34575 if (CC == X86::COND_A) {
34576 SDValue EFLAGS = Y->getOperand(1);
34577 // Try to convert COND_A into COND_B in an attempt to facilitate
34578 // materializing "setb reg".
34580 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
34581 // cannot take an immediate as its first operand.
34583 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
34584 EFLAGS.getValueType().isInteger() &&
34585 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Rebuild the flag-producing SUB with its operands swapped so the "above"
// condition can be tested as "below" (carry) on the swapped compare.
34586 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
34587 EFLAGS.getNode()->getVTList(),
34588 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
// Keep referring to the same result number (the flags result) of the node.
34589 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
34590 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
34591 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34592 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34593 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
// From here on only equality/inequality conditions are handled.
34597 if (CC != X86::COND_E && CC != X86::COND_NE)
// The setcc must test an integer compare against zero.
34600 SDValue Cmp = Y.getOperand(1);
34601 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
34602 !X86::isZeroNode(Cmp.getOperand(1)) ||
34603 !Cmp.getOperand(0).getValueType().isInteger())
34606 // (cmp Z, 1) sets the carry flag if Z is 0.
34607 SDValue Z = Cmp.getOperand(0);
34608 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
34609 DAG.getConstant(1, DL, Z.getValueType()));
// Result list: the arithmetic value plus an i32 flags result for ADC/SBB.
34611 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
34613 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
34614 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
34615 if (CC == X86::COND_NE)
34616 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
34617 DAG.getConstant(-1ULL, DL, VT), NewCmp);
34619 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
34620 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
34621 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
34622 DAG.getConstant(0, DL, VT), NewCmp);
/// Match a vector-reduction add whose other operand is a multiply that can be
/// shrunk to 16-bit elements, and lower the multiply-accumulate to
/// X86ISD::VPMADDWD (pmaddwd). The pmaddwd result has half as many (i32)
/// elements, so it is padded with zeros before being added into the
/// reduction phi.
34625 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
34626 const X86Subtarget &Subtarget) {
34627 SDValue MulOp = N->getOperand(0);
34628 SDValue Phi = N->getOperand(1);
// The multiply may be on either side of the add; canonicalize it into MulOp.
34630 if (MulOp.getOpcode() != ISD::MUL)
34631 std::swap(MulOp, Phi);
34632 if (MulOp.getOpcode() != ISD::MUL)
// Bail out unless the multiply's operands fit in 16 bits. MULU16 mode is
// rejected — presumably because pmaddwd performs signed multiplies; confirm
// against canReduceVMulWidth's mode definitions.
34636 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
34639 EVT VT = N->getValueType(0);
// Widest usable vector register: 128 bits by default; wider with AVX2 /
// AVX-512BW (the exact widths are set in the elided branches below).
34641 unsigned RegSize = 128;
34642 if (Subtarget.hasBWI())
34644 else if (Subtarget.hasAVX2())
34646 unsigned VectorSize = VT.getVectorNumElements() * 16;
34647 // If the vector size is less than 128, or greater than the supported RegSize,
34648 // do not use PMADD.
34649 if (VectorSize < 128 || VectorSize > RegSize)
// i16 vector with the original element count; i32 vector with half the count
// (pmaddwd pairs adjacent products into one i32 lane).
34653 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
34654 VT.getVectorNumElements());
34655 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34656 VT.getVectorNumElements() / 2);
34658 // Shrink the operands of mul.
34659 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
34660 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
34662 // Madd vector size is half of the original vector size
34663 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
34664 // Fill the rest of the output with 0
34665 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
34666 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
34667 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
/// Match a vector-reduction add that accumulates absolute differences of
/// zero-extended values (a sum-of-absolute-differences loop) and lower it to
/// X86ISD::PSADBW plus an ordinary vector add into the reduction phi.
34670 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
34671 const X86Subtarget &Subtarget) {
34673 EVT VT = N->getValueType(0);
34674 SDValue Op0 = N->getOperand(0);
34675 SDValue Op1 = N->getOperand(1);
34677 // TODO: There's nothing special about i32, any integer type above i16 should
34678 // work just as well.
34679 if (!VT.isVector() || !VT.isSimple() ||
34680 !(VT.getVectorElementType() == MVT::i32))
// Widest usable vector register: 128 bits by default; wider with AVX2 /
// AVX-512BW (the exact widths are set in the elided branches below).
34683 unsigned RegSize = 128;
34684 if (Subtarget.hasBWI())
34686 else if (Subtarget.hasAVX2())
34689 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
34690 // TODO: We should be able to handle larger vectors by splitting them before
34691 // feeding them into several SADs, and then reducing over those.
34692 if (VT.getSizeInBits() / 4 > RegSize)
34695 // We know N is a reduction add, which means one of its operands is a phi.
34696 // To match SAD, we need the other operand to be a vector select.
34697 SDValue SelectOp, Phi;
34698 if (Op0.getOpcode() == ISD::VSELECT) {
34701 } else if (Op1.getOpcode() == ISD::VSELECT) {
34707 // Check whether we have an abs-diff pattern feeding into the select.
// On success, detectZextAbsDiff rewrites Op0/Op1 to the two byte-vector
// inputs of the abs-diff.
34708 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
34711 // SAD pattern detected. Now build a SAD instruction and an addition for
34712 // reduction. Note that the number of elements of the result of SAD is less
34713 // than the number of elements of its input. Therefore, we could only update
34714 // part of elements in the reduction vector.
34715 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
34717 // The output of PSADBW is a vector of i64.
34718 // We need to turn the vector of i64 into a vector of i32.
34719 // If the reduction vector is at least as wide as the psadbw result, just
34720 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
34722 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
34723 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
34724 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
34726 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
34728 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
34729 // Update part of elements of the reduction vector. This is done by first
34730 // extracting a sub-vector from it, updating this sub-vector, and inserting
34732 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
34733 DAG.getIntPtrConstant(0, DL));
34734 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
34735 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
34736 DAG.getIntPtrConstant(0, DL));
// Same width: add the whole SAD result into the reduction phi directly.
34738 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
/// Combine ISD::ADD: try the vector-reduction SAD / pmaddwd patterns first,
/// then horizontal adds of shuffles, and finally the setcc -> ADC/SBB fold.
34741 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
34742 const X86Subtarget &Subtarget) {
// The reduction patterns only apply when the IR flagged this add as part of
// a vector reduction.
34743 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
34744 if (Flags->hasVectorReduction()) {
34745 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
34747 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
34750 EVT VT = N->getValueType(0);
34751 SDValue Op0 = N->getOperand(0);
34752 SDValue Op1 = N->getOperand(1);
34754 // Try to synthesize horizontal adds from adds of shuffles.
// phaddw/phaddd need SSSE3 for 128-bit types; the 256-bit forms need AVX2.
34755 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34756 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34757 isHorizontalBinOp(Op0, Op1, true))
34758 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
34760 return combineAddOrSubToADCOrSBB(N, DAG);
/// Combine ISD::SUB: fold constant - (x ^ c) into an add form, try
/// horizontal subs of shuffles, and finally the setcc -> ADC/SBB fold.
34763 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
34764 const X86Subtarget &Subtarget) {
34765 SDValue Op0 = N->getOperand(0);
34766 SDValue Op1 = N->getOperand(1);
34768 // X86 can't encode an immediate LHS of a sub. See if we can push the
34769 // negation into a preceding instruction.
34770 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
34771 // If the RHS of the sub is a XOR with one use and a constant, invert the
34772 // immediate. Then add one to the LHS of the sub so we can turn
34773 // X-Y -> X+~Y+1, saving one register.
34774 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
34775 isa<ConstantSDNode>(Op1.getOperand(1))) {
34776 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
34777 EVT VT = Op0.getValueType();
// (x ^ c) is replaced by (x ^ ~c); combined with the +1 below this realizes
// C - (x ^ c) == (x ^ ~c) + (C + 1) via two's complement.
34778 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
34780 DAG.getConstant(~XorC, SDLoc(Op1), VT));
34781 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
34782 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
34786 // Try to synthesize horizontal subs from subs of shuffles.
// phsubw/phsubd need SSSE3 for 128-bit types; the 256-bit forms need AVX2.
34787 EVT VT = N->getValueType(0);
34788 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34789 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34790 isHorizontalBinOp(Op0, Op1, false))
34791 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
34793 return combineAddOrSubToADCOrSBB(N, DAG);
/// Combine vector sign/zero extensions (X86ISD::VSEXT / VZEXT and the
/// ISD::*_EXTEND_VECTOR_INREG forms): constant-fold the extension, merge
/// nested vzexts, and bypass an extract/re-insert of the source element.
34796 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
34797 TargetLowering::DAGCombinerInfo &DCI,
34798 const X86Subtarget &Subtarget) {
34799 if (DCI.isBeforeLegalize())
34803 unsigned Opcode = N->getOpcode();
34804 MVT VT = N->getSimpleValueType(0);
34805 MVT SVT = VT.getVectorElementType();
34806 unsigned NumElts = VT.getVectorNumElements();
34807 unsigned EltSizeInBits = SVT.getSizeInBits();
34809 SDValue Op = N->getOperand(0);
34810 MVT OpVT = Op.getSimpleValueType();
34811 MVT OpEltVT = OpVT.getVectorElementType();
34812 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
// Number of input bits actually consumed by the extension.
34813 unsigned InputBits = OpEltSizeInBits * NumElts;
34815 // Perform any constant folding.
34816 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
34818 SmallVector<APInt, 64> EltBits;
34819 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
34820 APInt Undefs(NumElts, 0);
34821 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
// Zero-extending opcodes use zext folding; everything else sign-extends.
34823 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
34824 for (unsigned i = 0; i != NumElts; ++i) {
34825 if (UndefElts[i]) {
34829 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
34830 : EltBits[i].sextOrTrunc(EltSizeInBits);
34832 return getConstVector(Vals, Undefs, VT, DAG, DL);
34835 // (vzext (bitcast (vzext (x)) -> (vzext x)
34836 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
34837 SDValue V = peekThroughBitcasts(Op);
34838 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
34839 MVT InnerVT = V.getSimpleValueType();
34840 MVT InnerEltVT = InnerVT.getVectorElementType();
34842 // If the element sizes match exactly, we can just do one larger vzext. This
34843 // is always an exact type match as vzext operates on integer types.
34844 if (OpEltVT == InnerEltVT) {
34845 assert(OpVT == InnerVT && "Types must match for vzext!");
34846 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
34849 // The only other way we can combine them is if only a single element of the
34850 // inner vzext is used in the input to the outer vzext.
34851 if (InnerEltVT.getSizeInBits() < InputBits)
34854 // In this case, the inner vzext is completely dead because we're going to
34855 // only look at bits inside of the low element. Just do the outer vzext on
34856 // a bitcast of the input to the inner.
34857 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
34860 // Check if we can bypass extracting and re-inserting an element of an input
34861 // vector. Essentially:
34862 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
34863 // TODO: Add X86ISD::VSEXT support
34864 if (Opcode == X86ISD::VZEXT &&
34865 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
34866 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34867 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
34868 SDValue ExtractedV = V.getOperand(0);
34869 SDValue OrigV = ExtractedV.getOperand(0);
// Only element 0 can be bypassed by a plain bitcast of the source vector.
34870 if (isNullConstant(ExtractedV.getOperand(1))) {
34871 MVT OrigVT = OrigV.getSimpleValueType();
34872 // Extract a subvector if necessary...
34873 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
34874 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
34875 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
34876 OrigVT.getVectorNumElements() / Ratio);
34877 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
34878 DAG.getIntPtrConstant(0, DL));
34880 Op = DAG.getBitcast(OpVT, OrigV);
34881 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
34888 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
/// Keeping all locked decrements-by-one in LADD form lets later code treat
/// them uniformly; the memory operand and chain are preserved.
34889 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
34890 const X86Subtarget &Subtarget) {
34891 SDValue Chain = N->getOperand(0);
34892 SDValue LHS = N->getOperand(1);
34893 SDValue RHS = N->getOperand(2);
34894 MVT VT = RHS.getSimpleValueType();
// Only the literal constant 1 is rewritten.
34897 auto *C = dyn_cast<ConstantSDNode>(RHS);
34898 if (!C || C->getZExtValue() != 1)
34901 RHS = DAG.getConstant(-1, DL, VT);
// Re-attach the original memory operand so alias info is not lost.
34902 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
34903 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
34904 DAG.getVTList(MVT::i32, MVT::Other),
34905 {Chain, LHS, RHS}, VT, MMO);
34908 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
// The AVX-512 mask test already ANDs its operands, so when both operands are
// the same AND node the inner AND is redundant.
34909 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
34910 SDValue Op0 = N->getOperand(0);
34911 SDValue Op1 = N->getOperand(1);
// Require the two operands to be the identical AND node.
34913 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
34916 EVT VT = N->getValueType(0);
34919 return DAG.getNode(X86ISD::TESTM, DL, VT,
34920 Op0->getOperand(0), Op0->getOperand(1));
/// Fold integer vector compares with identical operands:
/// PCMPEQ(x, x) is all-ones and PCMPGT(x, x) is all-zeros.
34923 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
34924 const X86Subtarget &Subtarget) {
34925 MVT VT = N->getSimpleValueType(0);
34928 if (N->getOperand(0) == N->getOperand(1)) {
34929 if (N->getOpcode() == X86ISD::PCMPEQ)
34930 return getOnesVector(VT, DAG, DL);
34931 if (N->getOpcode() == X86ISD::PCMPGT)
34932 return getZeroVector(VT, Subtarget, DAG, DL);
/// Combine ISD::INSERT_SUBVECTOR: turn insert-of-extract into a shuffle,
/// merge two half-width subvector loads into one full-width load, and
/// recognize the subvector-broadcast patterns (X86ISD::SUBV_BROADCAST).
34938 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
34939 TargetLowering::DAGCombinerInfo &DCI,
34940 const X86Subtarget &Subtarget) {
34941 if (DCI.isBeforeLegalizeOps())
34945 SDValue Vec = N->getOperand(0);
34946 SDValue SubVec = N->getOperand(1);
34947 SDValue Idx = N->getOperand(2);
34949 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
34950 MVT OpVT = N->getSimpleValueType(0);
34951 MVT SubVecVT = SubVec.getSimpleValueType();
34953 // If this is an insert of an extract, combine to a shuffle. Don't do this
34954 // if the insert or extract can be represented with a subvector operation.
34955 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
34956 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
34957 (IdxVal != 0 || !Vec.isUndef())) {
34958 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
34959 if (ExtIdxVal != 0) {
34960 int VecNumElts = OpVT.getVectorNumElements();
34961 int SubVecNumElts = SubVecVT.getVectorNumElements();
34962 SmallVector<int, 64> Mask(VecNumElts);
34963 // First create an identity shuffle mask.
34964 for (int i = 0; i != VecNumElts; ++i)
34966 // Now insert the extracted portion.
// Indices >= VecNumElts select from the second shuffle input (the vector
// the subvector was extracted from).
34967 for (int i = 0; i != SubVecNumElts; ++i)
34968 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
34970 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
34974 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
34976 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34977 // (load16 addr + 16), Elts/2)
34980 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34981 // (load32 addr + 32), Elts/2)
34983 // or a 16-byte or 32-byte broadcast:
34984 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34985 // (load16 addr), Elts/2)
34986 // --> X86SubVBroadcast(load16 addr)
34988 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34989 // (load32 addr), Elts/2)
34990 // --> X86SubVBroadcast(load32 addr)
// Only fires when inserting exactly into the upper half of a vector that is
// itself built by an insert into the lower half.
34991 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
34992 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
34993 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
34994 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
34995 if (Idx2 && Idx2->getZExtValue() == 0) {
34996 SDValue SubVec2 = Vec.getOperand(1);
34997 // If needed, look through bitcasts to get to the load.
34998 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35000 unsigned Alignment = FirstLd->getAlignment();
35001 unsigned AS = FirstLd->getAddressSpace();
35002 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
// Only merge into one wide load if that access is legal and fast here.
35003 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35004 OpVT, AS, Alignment, &Fast) && Fast) {
35005 SDValue Ops[] = {SubVec2, SubVec};
35006 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
35010 // If lower/upper loads are the same and the only users of the load, then
35011 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35012 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35013 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35014 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35015 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35018 // If this is subv_broadcast insert into both halves, use a larger
35020 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35021 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35022 SubVec.getOperand(0));
/// Target hook called by the generic DAG combiner for every node; dispatches
/// on the node's opcode to the X86-specific combine helpers defined above.
/// Returns the replacement value, or an empty SDValue for "no change"
/// (handled by the elided default path).
35031 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35032 DAGCombinerInfo &DCI) const {
35033 SelectionDAG &DAG = DCI.DAG;
35034 switch (N->getOpcode()) {
35036 case ISD::EXTRACT_VECTOR_ELT:
35037 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35038 case X86ISD::PEXTRW:
35039 case X86ISD::PEXTRB:
35040 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35041 case ISD::INSERT_SUBVECTOR:
35042 return combineInsertSubvector(N, DAG, DCI, Subtarget);
35045 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35046 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
35047 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
35048 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
35049 case ISD::SUB: return combineSub(N, DAG, Subtarget);
35050 case X86ISD::ADC: return combineADC(N, DAG, DCI);
35051 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35054 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35055 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35056 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35057 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35058 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35059 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35060 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35061 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35062 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35063 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35065 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35066 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35067 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35068 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35069 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35070 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35072 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35074 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35076 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35077 case X86ISD::BT: return combineBT(N, DAG, DCI);
35078 case ISD::ANY_EXTEND:
35079 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35080 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35081 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35082 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35083 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35084 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35085 case X86ISD::VSHLI:
35086 case X86ISD::VSRAI:
35087 case X86ISD::VSRLI:
35088 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35089 case ISD::SIGN_EXTEND_VECTOR_INREG:
35090 case ISD::ZERO_EXTEND_VECTOR_INREG:
35091 case X86ISD::VSEXT:
35092 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35093 case X86ISD::PINSRB:
35094 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35095 case X86ISD::SHUFP: // Handle all target specific shuffles
35096 case X86ISD::INSERTPS:
35097 case X86ISD::PALIGNR:
35098 case X86ISD::VSHLDQ:
35099 case X86ISD::VSRLDQ:
35100 case X86ISD::BLENDI:
35101 case X86ISD::UNPCKH:
35102 case X86ISD::UNPCKL:
35103 case X86ISD::MOVHLPS:
35104 case X86ISD::MOVLHPS:
35105 case X86ISD::PSHUFB:
35106 case X86ISD::PSHUFD:
35107 case X86ISD::PSHUFHW:
35108 case X86ISD::PSHUFLW:
35109 case X86ISD::MOVSHDUP:
35110 case X86ISD::MOVSLDUP:
35111 case X86ISD::MOVDDUP:
35112 case X86ISD::MOVSS:
35113 case X86ISD::MOVSD:
35114 case X86ISD::VPPERM:
35115 case X86ISD::VPERMI:
35116 case X86ISD::VPERMV:
35117 case X86ISD::VPERMV3:
35118 case X86ISD::VPERMIV3:
35119 case X86ISD::VPERMIL2:
35120 case X86ISD::VPERMILPI:
35121 case X86ISD::VPERMILPV:
35122 case X86ISD::VPERM2X128:
35123 case X86ISD::VZEXT_MOVL:
35124 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35125 case X86ISD::FMADD:
35126 case X86ISD::FMADD_RND:
35127 case X86ISD::FMADDS1_RND:
35128 case X86ISD::FMADDS3_RND:
35129 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35131 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35132 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35133 case X86ISD::TESTM: return combineTestM(N, DAG);
35134 case X86ISD::PCMPEQ:
35135 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35141 /// Return true if the target has native support for the specified value type
35142 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35143 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35144 /// some i16 instructions are slow.
35145 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
// Illegal types are never desirable.
35146 if (!isTypeLegal(VT))
// Only i16 gets the per-opcode treatment below; all other legal types take
// the early-return path (elided in this excerpt).
35148 if (VT != MVT::i16)
35155 case ISD::SIGN_EXTEND:
35156 case ISD::ZERO_EXTEND:
35157 case ISD::ANY_EXTEND:
35170 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35171 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35172 /// we don't adjust the stack we clobber the first frame index.
35173 /// See X86InstrInfo::copyPhysReg.
35174 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
35175 MachineFunction *MF) const {
35176 const MachineRegisterInfo &MRI = MF->getRegInfo();
// True iff any instruction touching EFLAGS in this function is a COPY.
35178 return any_of(MRI.reg_instructions(X86::EFLAGS),
35179 [](const MachineInstr &RI) { return RI.isCopy(); });
35182 /// This method query the target whether it is beneficial for dag combiner to
35183 /// promote the specified node. If true, it should return the desired promotion
35184 /// type by reference.
35185 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35186 EVT VT = Op.getValueType();
// Only i16 operations are candidates for promotion (see
// isTypeDesirableForOp for the rationale).
35187 if (VT != MVT::i16)
35190 bool Promote = false;
35191 bool Commute = false;
35192 switch (Op.getOpcode()) {
35194 case ISD::SIGN_EXTEND:
35195 case ISD::ZERO_EXTEND:
35196 case ISD::ANY_EXTEND:
35201 SDValue N0 = Op.getOperand(0);
35202 // Look out for (store (shl (load), x)).
// Promoting would break the load-op-store folding opportunity.
35203 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35216 SDValue N0 = Op.getOperand(0);
35217 SDValue N1 = Op.getOperand(1);
35218 if (!Commute && MayFoldLoad(N1))
35220 // Avoid disabling potential load folding opportunities.
35221 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35223 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35233 //===----------------------------------------------------------------------===//
35234 // X86 Inline Assembly Support
35235 //===----------------------------------------------------------------------===//
35237 // Helper to match a string separated by whitespace.
// Returns true when S consists exactly of the given pieces in order, with
// arbitrary whitespace between them; each piece must be followed by
// whitespace (or end of string) so a piece cannot match a mere prefix.
35238 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35239 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35241 for (StringRef Piece : Pieces) {
35242 if (!S.startswith(Piece)) // Check if the piece matches.
35245 S = S.substr(Piece.size());
35246 StringRef::size_type Pos = S.find_first_not_of(" \t");
35247 if (Pos == 0) // We matched a prefix.
/// Return true if the parsed inline-asm clobber list covers the flag
/// registers: it must name "~{cc}", "~{flags}" and "~{fpsr}", and if a fourth
/// entry is present it must be "~{dirflag}".
35256 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35258 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35259 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35260 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35261 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35263 if (AsmPieces.size() == 3)
35265 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
/// Recognize well-known byte-swap inline-asm idioms (bswap, rorw/rolw $8,
/// the 32-bit three-instruction rotate sequence, and the i64 eax/edx swap)
/// and replace the asm call with the llvm.bswap intrinsic.
35272 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35273 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35275 const std::string &AsmStr = IA->getAsmString();
// bswap only makes sense on integer results whose width is a multiple of 16.
35277 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35278 if (!Ty || Ty->getBitWidth() % 16 != 0)
35281 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35282 SmallVector<StringRef, 4> AsmPieces;
// Split the template into individual instructions.
35283 SplitString(AsmStr, AsmPieces, ";\n");
35285 switch (AsmPieces.size()) {
35286 default: return false;
35288 // FIXME: this should verify that we are targeting a 486 or better. If not,
35289 // we will turn this bswap into something that will be lowered to logical
35290 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35291 // lower so don't worry about this.
35293 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35294 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35295 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35296 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35297 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35298 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35299 // No need to check constraints, nothing other than the equivalent of
35300 // "=r,0" would be valid here.
35301 return IntrinsicLowering::LowerToByteSwap(CI);
35304 // rorw $$8, ${0:w} --> llvm.bswap.i16
35305 if (CI->getType()->isIntegerTy(16) &&
35306 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35307 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35308 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
// The rotate idiom clobbers flags; verify the asm declares that.
35310 StringRef ConstraintsStr = IA->getConstraintString();
35311 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35312 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35313 if (clobbersFlagRegisters(AsmPieces))
35314 return IntrinsicLowering::LowerToByteSwap(CI);
// Three-instruction 32-bit byte swap: rorw/rorl/rorw.
35318 if (CI->getType()->isIntegerTy(32) &&
35319 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35320 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
35321 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
35322 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"}) {
35324 StringRef ConstraintsStr = IA->getConstraintString();
35325 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35326 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35327 if (clobbersFlagRegisters(AsmPieces))
35328 return IntrinsicLowering::LowerToByteSwap(CI);
35331 if (CI->getType()->isIntegerTy(64)) {
35332 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
// "A" ties the result to edx:eax, "0" ties the input to the same registers.
35333 if (Constraints.size() >= 2 &&
35334 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
35335 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
35336 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
35337 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
35338 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
35339 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
35340 return IntrinsicLowering::LowerToByteSwap(CI);
35348 /// Given a constraint letter, return the type of constraint for this target.
/// Single-letter and the supported two-letter ("Y<x>") inline-asm constraints
/// are classified here; anything unrecognized falls back to the generic
/// TargetLowering classification.
35349 X86TargetLowering::ConstraintType
35350 X86TargetLowering::getConstraintType(StringRef Constraint) const {
35351 if (Constraint.size() == 1) {
35352 switch (Constraint[0]) {
35364 return C_RegisterClass;
35365 case 'k': // AVX512 masking registers.
// Two-letter constraints: dispatch on the first letter, then the second.
35389 else if (Constraint.size() == 2) {
35390 switch (Constraint[0]) {
35394 switch (Constraint[1]) {
35402 return TargetLowering::getConstraintType(Constraint);
35405 /// Examine constraint type and operand type and determine a weight value.
35406 /// This object must already have been set up with the operand type
35407 /// and the current alternative constraint selected.
35408 TargetLowering::ConstraintWeight
35409 X86TargetLowering::getSingleConstraintMatchWeight(
35410 AsmOperandInfo &info, const char *constraint) const {
35411 ConstraintWeight weight = CW_Invalid;
35412 Value *CallOperandVal = info.CallOperandVal;
35413 // If we don't have a value, we can't do a match,
35414 // but allow it at the lowest weight.
35415 if (!CallOperandVal)
35417 Type *type = CallOperandVal->getType();
35418 // Look at the constraint type.
// NOTE(review): the case labels for this switch are elided in this excerpt;
// each group below weights one constraint letter against the operand type.
35419 switch (*constraint) {
35421 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
35432 if (CallOperandVal->getType()->isIntegerTy())
35433 weight = CW_SpecificReg;
35438 if (type->isFloatingPointTy())
35439 weight = CW_SpecificReg;
35442 if (type->isX86_MMXTy() && Subtarget.hasMMX())
35443 weight = CW_SpecificReg;
35446 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
35447 if (constraint[1] == 'k') {
35448 // Support for 'Yk' (similarly to the 'k' variant below).
35449 weight = CW_SpecificReg;
35452 // Else fall through (handle "Y" constraint).
35455 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
35456 weight = CW_Register;
35459 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
35460 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
35461 weight = CW_Register;
35464 // Enable conditional vector operations using %k<#> registers.
35465 weight = CW_SpecificReg;
// The remaining groups weight immediate constraints by the constant's range.
35468 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
35469 if (C->getZExtValue() <= 31)
35470 weight = CW_Constant;
35474 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35475 if (C->getZExtValue() <= 63)
35476 weight = CW_Constant;
35480 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35481 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
35482 weight = CW_Constant;
35486 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35487 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
35488 weight = CW_Constant;
35492 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35493 if (C->getZExtValue() <= 3)
35494 weight = CW_Constant;
35498 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35499 if (C->getZExtValue() <= 0xff)
35500 weight = CW_Constant;
35505 if (isa<ConstantFP>(CallOperandVal)) {
35506 weight = CW_Constant;
35510 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35511 if ((C->getSExtValue() >= -0x80000000LL) &&
35512 (C->getSExtValue() <= 0x7fffffffLL))
35513 weight = CW_Constant;
35517 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
35518 if (C->getZExtValue() <= 0xffffffff)
35519 weight = CW_Constant;
35526 /// Try to replace an X constraint, which matches anything, with another that
35527 /// has more specific requirements based on the type of the corresponding
/// operand.
35529 const char *X86TargetLowering::
35530 LowerXConstraint(EVT ConstraintVT) const {
35531 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
35532 // 'f' like normal targets.
35533 if (ConstraintVT.isFloatingPoint()) {
35534 if (Subtarget.hasSSE2())
35536 if (Subtarget.hasSSE1())
// Non-FP (or no SSE): defer to the generic implementation.
35540 return TargetLowering::LowerXConstraint(ConstraintVT);
35543 /// Lower the specified operand into the Ops vector.
35544 /// If it is invalid, don't add anything to Ops.
35545 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
35546 std::string &Constraint,
35547 std::vector<SDValue>&Ops,
35548 SelectionDAG &DAG) const {
35551 // Only support length 1 constraints for now.
35552 if (Constraint.length() > 1) return;
35554 char ConstraintLetter = Constraint[0];
35555 switch (ConstraintLetter) {
35558 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35559 if (C->getZExtValue() <= 31) {
35560 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35561 Op.getValueType());
35567 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35568 if (C->getZExtValue() <= 63) {
35569 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35570 Op.getValueType());
35576 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35577 if (isInt<8>(C->getSExtValue())) {
35578 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35579 Op.getValueType());
35585 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35586 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
35587 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
35588 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
35589 Op.getValueType());
35595 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35596 if (C->getZExtValue() <= 3) {
35597 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35598 Op.getValueType());
35604 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35605 if (C->getZExtValue() <= 255) {
35606 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35607 Op.getValueType());
35613 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35614 if (C->getZExtValue() <= 127) {
35615 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35616 Op.getValueType());
35622 // 32-bit signed value
35623 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35624 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
35625 C->getSExtValue())) {
35626 // Widen to 64 bits here to get it sign extended.
35627 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
35630 // FIXME gcc accepts some relocatable values here too, but only in certain
35631 // memory models; it's complicated.
35636 // 32-bit unsigned value
35637 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
35638 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
35639 C->getZExtValue())) {
35640 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
35641 Op.getValueType());
35645 // FIXME gcc accepts some relocatable values here too, but only in certain
35646 // memory models; it's complicated.
35650 // Literal immediates are always ok.
35651 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
35652 // Widen to 64 bits here to get it sign extended.
35653 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
35657 // In any sort of PIC mode addresses need to be computed at runtime by
35658 // adding in a register or some sort of table lookup. These can't
35659 // be used as immediates.
35660 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
35663 // If we are in non-pic codegen mode, we allow the address of a global (with
35664 // an optional displacement) to be used with 'i'.
35665 GlobalAddressSDNode *GA = nullptr;
35666 int64_t Offset = 0;
35668 // Match either (GA), (GA+C), (GA+C1+C2), etc.
35670 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
35671 Offset += GA->getOffset();
35673 } else if (Op.getOpcode() == ISD::ADD) {
35674 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
35675 Offset += C->getZExtValue();
35676 Op = Op.getOperand(0);
35679 } else if (Op.getOpcode() == ISD::SUB) {
35680 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
35681 Offset += -C->getZExtValue();
35682 Op = Op.getOperand(0);
35687 // Otherwise, this isn't something we can handle, reject it.
35691 const GlobalValue *GV = GA->getGlobal();
35692 // If we require an extra load to get this address, as in PIC mode, we
35693 // can't accept it.
35694 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
35697 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
35698 GA->getValueType(0), Offset);
35703 if (Result.getNode()) {
35704 Ops.push_back(Result);
35707 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
35710 /// Check if \p RC is a general purpose register class.
35711 /// I.e., GR* or one of their variant.
35712 static bool isGRClass(const TargetRegisterClass &RC) {
35713 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
35714 RC.hasSuperClassEq(&X86::GR16RegClass) ||
35715 RC.hasSuperClassEq(&X86::GR32RegClass) ||
35716 RC.hasSuperClassEq(&X86::GR64RegClass) ||
35717 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
35720 /// Check if \p RC is a vector register class.
35721 /// I.e., FR* / VR* or one of their variant.
35722 static bool isFRClass(const TargetRegisterClass &RC) {
35723 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
35724 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
35725 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
35726 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
35727 RC.hasSuperClassEq(&X86::VR512RegClass);
35730 std::pair<unsigned, const TargetRegisterClass *>
35731 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
35732 StringRef Constraint,
35734 // First, see if this is a constraint that directly corresponds to an LLVM
35736 if (Constraint.size() == 1) {
35737 // GCC Constraint Letters
35738 switch (Constraint[0]) {
35740 // TODO: Slight differences here in allocation order and leaving
35741 // RIP in the class. Do they matter any more here than they do
35742 // in the normal allocation?
35744 if (Subtarget.hasAVX512()) {
35745 // Only supported in AVX512 or later.
35746 switch (VT.SimpleTy) {
35749 return std::make_pair(0U, &X86::VK32RegClass);
35751 return std::make_pair(0U, &X86::VK16RegClass);
35753 return std::make_pair(0U, &X86::VK8RegClass);
35755 return std::make_pair(0U, &X86::VK1RegClass);
35757 return std::make_pair(0U, &X86::VK64RegClass);
35761 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
35762 if (Subtarget.is64Bit()) {
35763 if (VT == MVT::i32 || VT == MVT::f32)
35764 return std::make_pair(0U, &X86::GR32RegClass);
35765 if (VT == MVT::i16)
35766 return std::make_pair(0U, &X86::GR16RegClass);
35767 if (VT == MVT::i8 || VT == MVT::i1)
35768 return std::make_pair(0U, &X86::GR8RegClass);
35769 if (VT == MVT::i64 || VT == MVT::f64)
35770 return std::make_pair(0U, &X86::GR64RegClass);
35773 // 32-bit fallthrough
35774 case 'Q': // Q_REGS
35775 if (VT == MVT::i32 || VT == MVT::f32)
35776 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
35777 if (VT == MVT::i16)
35778 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
35779 if (VT == MVT::i8 || VT == MVT::i1)
35780 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
35781 if (VT == MVT::i64)
35782 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
35784 case 'r': // GENERAL_REGS
35785 case 'l': // INDEX_REGS
35786 if (VT == MVT::i8 || VT == MVT::i1)
35787 return std::make_pair(0U, &X86::GR8RegClass);
35788 if (VT == MVT::i16)
35789 return std::make_pair(0U, &X86::GR16RegClass);
35790 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
35791 return std::make_pair(0U, &X86::GR32RegClass);
35792 return std::make_pair(0U, &X86::GR64RegClass);
35793 case 'R': // LEGACY_REGS
35794 if (VT == MVT::i8 || VT == MVT::i1)
35795 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
35796 if (VT == MVT::i16)
35797 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
35798 if (VT == MVT::i32 || !Subtarget.is64Bit())
35799 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
35800 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
35801 case 'f': // FP Stack registers.
35802 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
35803 // value to the correct fpstack register class.
35804 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
35805 return std::make_pair(0U, &X86::RFP32RegClass);
35806 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
35807 return std::make_pair(0U, &X86::RFP64RegClass);
35808 return std::make_pair(0U, &X86::RFP80RegClass);
35809 case 'y': // MMX_REGS if MMX allowed.
35810 if (!Subtarget.hasMMX()) break;
35811 return std::make_pair(0U, &X86::VR64RegClass);
35812 case 'Y': // SSE_REGS if SSE2 allowed
35813 if (!Subtarget.hasSSE2()) break;
35816 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
35817 if (!Subtarget.hasSSE1()) break;
35818 bool VConstraint = (Constraint[0] == 'v');
35820 switch (VT.SimpleTy) {
35822 // Scalar SSE types.
35825 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
35826 return std::make_pair(0U, &X86::FR32XRegClass);
35827 return std::make_pair(0U, &X86::FR32RegClass);
35830 if (VConstraint && Subtarget.hasVLX())
35831 return std::make_pair(0U, &X86::FR64XRegClass);
35832 return std::make_pair(0U, &X86::FR64RegClass);
35833 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
35841 if (VConstraint && Subtarget.hasVLX())
35842 return std::make_pair(0U, &X86::VR128XRegClass);
35843 return std::make_pair(0U, &X86::VR128RegClass);
35851 if (VConstraint && Subtarget.hasVLX())
35852 return std::make_pair(0U, &X86::VR256XRegClass);
35853 return std::make_pair(0U, &X86::VR256RegClass);
35858 return std::make_pair(0U, &X86::VR512RegClass);
35862 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
35863 switch (Constraint[1]) {
35867 // This register class doesn't allocate k0 for masked vector operation.
35868 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
35869 switch (VT.SimpleTy) {
35872 return std::make_pair(0U, &X86::VK32WMRegClass);
35874 return std::make_pair(0U, &X86::VK16WMRegClass);
35876 return std::make_pair(0U, &X86::VK8WMRegClass);
35878 return std::make_pair(0U, &X86::VK1WMRegClass);
35880 return std::make_pair(0U, &X86::VK64WMRegClass);
35887 // Use the default implementation in TargetLowering to convert the register
35888 // constraint into a member of a register class.
35889 std::pair<unsigned, const TargetRegisterClass*> Res;
35890 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
35892 // Not found as a standard register?
35894 // Map st(0) -> st(7) -> ST0
35895 if (Constraint.size() == 7 && Constraint[0] == '{' &&
35896 tolower(Constraint[1]) == 's' &&
35897 tolower(Constraint[2]) == 't' &&
35898 Constraint[3] == '(' &&
35899 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
35900 Constraint[5] == ')' &&
35901 Constraint[6] == '}') {
35903 Res.first = X86::FP0+Constraint[4]-'0';
35904 Res.second = &X86::RFP80RegClass;
35908 // GCC allows "st(0)" to be called just plain "st".
35909 if (StringRef("{st}").equals_lower(Constraint)) {
35910 Res.first = X86::FP0;
35911 Res.second = &X86::RFP80RegClass;
35916 if (StringRef("{flags}").equals_lower(Constraint)) {
35917 Res.first = X86::EFLAGS;
35918 Res.second = &X86::CCRRegClass;
35922 // 'A' means [ER]AX + [ER]DX.
35923 if (Constraint == "A") {
35924 if (Subtarget.is64Bit()) {
35925 Res.first = X86::RAX;
35926 Res.second = &X86::GR64_ADRegClass;
35928 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
35929 "Expecting 64, 32 or 16 bit subtarget");
35930 Res.first = X86::EAX;
35931 Res.second = &X86::GR32_ADRegClass;
35938 // Otherwise, check to see if this is a register class of the wrong value
35939 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
35940 // turn into {ax},{dx}.
35941 // MVT::Other is used to specify clobber names.
35942 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
35943 return Res; // Correct type already, nothing to do.
35945 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
35946 // return "eax". This should even work for things like getting 64bit integer
35947 // registers when given an f64 type.
35948 const TargetRegisterClass *Class = Res.second;
35949 // The generic code will match the first register class that contains the
35950 // given register. Thus, based on the ordering of the tablegened file,
35951 // the "plain" GR classes might not come first.
35952 // Therefore, use a helper method.
35953 if (isGRClass(*Class)) {
35954 unsigned Size = VT.getSizeInBits();
35955 if (Size == 1) Size = 8;
35956 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
35958 Res.first = DestReg;
35959 Res.second = Size == 8 ? &X86::GR8RegClass
35960 : Size == 16 ? &X86::GR16RegClass
35961 : Size == 32 ? &X86::GR32RegClass
35962 : &X86::GR64RegClass;
35963 assert(Res.second->contains(Res.first) && "Register in register class");
35965 // No register found/type mismatch.
35967 Res.second = nullptr;
35969 } else if (isFRClass(*Class)) {
35970 // Handle references to XMM physical registers that got mapped into the
35971 // wrong class. This can happen with constraints like {xmm0} where the
35972 // target independent register mapper will just pick the first match it can
35973 // find, ignoring the required type.
35975 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
35976 if (VT == MVT::f32 || VT == MVT::i32)
35977 Res.second = &X86::FR32RegClass;
35978 else if (VT == MVT::f64 || VT == MVT::i64)
35979 Res.second = &X86::FR64RegClass;
35980 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
35981 Res.second = &X86::VR128RegClass;
35982 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
35983 Res.second = &X86::VR256RegClass;
35984 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
35985 Res.second = &X86::VR512RegClass;
35987 // Type mismatch and not a clobber: Return an error;
35989 Res.second = nullptr;
35996 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
35997 const AddrMode &AM, Type *Ty,
35998 unsigned AS) const {
35999 // Scaling factors are not free at all.
36000 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
36001 // will take 2 allocations in the out of order engine instead of 1
36002 // for plain addressing mode, i.e. inst (reg1).
36004 // vaddps (%rsi,%drx), %ymm0, %ymm1
36005 // Requires two allocations (one for the load, one for the computation)
36007 // vaddps (%rsi), %ymm0, %ymm1
36008 // Requires just 1 allocation, i.e., freeing allocations for other operations
36009 // and having less micro operations to execute.
36011 // For some X86 architectures, this is even worse because for instance for
36012 // stores, the complex addressing mode forces the instruction to use the
36013 // "load" ports instead of the dedicated "store" port.
36014 // E.g., on Haswell:
36015 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
36016 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
36017 if (isLegalAddressingMode(DL, AM, Ty, AS))
36018 // Scale represents reg2 * scale, thus account for 1
36019 // as soon as we use a second register.
36020 return AM.Scale != 0;
36024 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
36025 // Integer division on x86 is expensive. However, when aggressively optimizing
36026 // for code size, we prefer to use a div instruction, as it is usually smaller
36027 // than the alternative sequence.
36028 // The exception to this is vector division. Since x86 doesn't have vector
36029 // integer division, leaving the division as-is is a loss even in terms of
36030 // size, because it will have to be scalarized, while the alternative code
36031 // sequence can be performed in vector form.
36033 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
36034 return OptSize && !VT.isVector();
36037 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
36038 if (!Subtarget.is64Bit())
36041 // Update IsSplitCSR in X86MachineFunctionInfo.
36042 X86MachineFunctionInfo *AFI =
36043 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
36044 AFI->setIsSplitCSR(true);
36047 void X86TargetLowering::insertCopiesSplitCSR(
36048 MachineBasicBlock *Entry,
36049 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36050 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36051 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36055 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36056 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36057 MachineBasicBlock::iterator MBBI = Entry->begin();
36058 for (const MCPhysReg *I = IStart; *I; ++I) {
36059 const TargetRegisterClass *RC = nullptr;
36060 if (X86::GR64RegClass.contains(*I))
36061 RC = &X86::GR64RegClass;
36063 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36065 unsigned NewVR = MRI->createVirtualRegister(RC);
36066 // Create copy from CSR to a virtual register.
36067 // FIXME: this currently does not emit CFI pseudo-instructions, it works
36068 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36069 // nounwind. If we want to generalize this later, we may need to emit
36070 // CFI pseudo-instructions.
36071 assert(Entry->getParent()->getFunction()->hasFnAttribute(
36072 Attribute::NoUnwind) &&
36073 "Function should be nounwind in insertCopiesSplitCSR!");
36074 Entry->addLiveIn(*I);
36075 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36078 // Insert the copy-back instructions right before the terminator.
36079 for (auto *Exit : Exits)
36080 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36081 TII->get(TargetOpcode::COPY), *I)
36086 bool X86TargetLowering::supportSwiftError() const {
36087 return Subtarget.is64Bit();