//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no
  // other SjLj exception interfaces are implemented and please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);

    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);

      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }
    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
    setOperationAction(ISD::SETCC, MVT::i1, Custom);
    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
    setOperationAction(ISD::XOR, MVT::i1, Legal);
    setOperationAction(ISD::OR, MVT::i1, Legal);
    setOperationAction(ISD::AND, MVT::i1, Legal);
    setOperationAction(ISD::SUB, MVT::i1, Custom);
    setOperationAction(ISD::ADD, MVT::i1, Custom);
    setOperationAction(ISD::MUL, MVT::i1, Custom);

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::ABS, MVT::v4i64, Legal);
      setOperationAction(ISD::ABS, MVT::v2i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These extending loads are also available on SSE/AVX2; add the
      // relevant patterns there.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
1323 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1324 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1325 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1327 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1328 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1329 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1330 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1331 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1332 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1333 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1334 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
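// AVX-512 has VRNDSCALEPS/PD, so the standard FP rounding operations are
// Legal for the 512-bit floating-point types.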
1336 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1337 setOperationAction(ISD::FFLOOR, VT, Legal);
1338 setOperationAction(ISD::FCEIL, VT, Legal);
1339 setOperationAction(ISD::FTRUNC, VT, Legal);
1340 setOperationAction(ISD::FRINT, VT, Legal);
1341 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1344 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1345 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1347 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1348 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1349 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1351 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1352 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1353 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1354 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1355 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1357 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1358 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1360 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1362 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1363 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1364 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1365 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1366 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1367 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1368 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1369 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1370 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1371 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1372 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1373 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1375 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1376 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1377 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1378 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1379 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1380 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1381 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1382 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1384 setOperationAction(ISD::ADD, MVT::v8i1, Custom);
1385 setOperationAction(ISD::ADD, MVT::v16i1, Custom);
1386 setOperationAction(ISD::SUB, MVT::v8i1, Custom);
1387 setOperationAction(ISD::SUB, MVT::v16i1, Custom);
1388 setOperationAction(ISD::MUL, MVT::v8i1, Custom);
1389 setOperationAction(ISD::MUL, MVT::v16i1, Custom);
1391 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
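// For the 512-bit integer types ABS is Legal, while shifts, CTPOP and CTTZ
// need custom lowering.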
1393 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1394 setOperationAction(ISD::ABS, VT, Legal);
1395 setOperationAction(ISD::SRL, VT, Custom);
1396 setOperationAction(ISD::SHL, VT, Custom);
1397 setOperationAction(ISD::SRA, VT, Custom);
1398 setOperationAction(ISD::CTPOP, VT, Custom);
1399 setOperationAction(ISD::CTTZ, VT, Custom);
1402 // Need to promote to 64-bit even though we have 32-bit masked instructions
1403 // because the IR optimizers rearrange bitcasts around logic ops leaving
1404 // too many variations to handle if we don't promote them.
1405 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1406 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1407 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
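// AVX512CD adds VPLZCNTD/VPLZCNTQ, making vector CTLZ Legal for the 512-bit
// integer types.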
1409 if (Subtarget.hasCDI()) {
1410 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1411 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1413 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1414 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1415 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1416 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1418 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1419 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1421 if (Subtarget.hasVLX()) {
1422 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1423 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1424 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1425 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1427 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1428 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1429 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1430 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1433 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1434 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1435 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1436 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1437 } // Subtarget.hasCDI()
1439 if (Subtarget.hasDQI()) {
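// AVX512DQ adds VPMULLQ, a native multiply for 64-bit vector elements.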
1440 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1441 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1442 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1443 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1446 // Custom lower several nodes.
1447 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1448 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1449 setOperationAction(ISD::MGATHER, VT, Custom);
1450 setOperationAction(ISD::MSCATTER, VT, Custom);
1452 // Extract subvector is special because the value type
1453 // (result) is 256-bit but the source is 512-bit wide.
1454 // 128-bit was made Custom under AVX1.
1455 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1456 MVT::v8f32, MVT::v4f64 })
1457 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1458 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1459 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1460 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1462 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1463 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1464 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1465 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1466 setOperationAction(ISD::VSELECT, VT, Legal);
1467 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1468 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1469 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1470 setOperationAction(ISD::MLOAD, VT, Legal);
1471 setOperationAction(ISD::MSTORE, VT, Legal);
1472 setOperationAction(ISD::MGATHER, VT, Legal);
1473 setOperationAction(ISD::MSCATTER, VT, Custom);
1475 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1476 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1477 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
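// AVX512BW adds the 512-bit byte/word vector types and the v32i1/v64i1 mask
// types.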
1481 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1482 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1483 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1485 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1486 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1488 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1489 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1490 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1491 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1492 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1493 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1495 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1496 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1497 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1498 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1499 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1500 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1501 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1503 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1505 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1507 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1508 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1509 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1510 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1511 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1512 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1513 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1514 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1515 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1516 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1517 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1518 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1519 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1520 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1521 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1522 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1523 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1524 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1525 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1527 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1528 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1529 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1530 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1531 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1532 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1533 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1534 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1535 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1536 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1537 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1538 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1539 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1540 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1541 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1543 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1544 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1545 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1546 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1547 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1548 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1549 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1550 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1552 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1554 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1555 if (Subtarget.hasVLX()) {
1556 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1557 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1560 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1561 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1562 setOperationAction(ISD::MLOAD, VT, Action);
1563 setOperationAction(ISD::MSTORE, VT, Action);
1566 if (Subtarget.hasCDI()) {
1567 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1568 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1571 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1572 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1573 setOperationAction(ISD::VSELECT, VT, Legal);
1574 setOperationAction(ISD::ABS, VT, Legal);
1575 setOperationAction(ISD::SRL, VT, Custom);
1576 setOperationAction(ISD::SHL, VT, Custom);
1577 setOperationAction(ISD::SRA, VT, Custom);
1578 setOperationAction(ISD::MLOAD, VT, Legal);
1579 setOperationAction(ISD::MSTORE, VT, Legal);
1580 setOperationAction(ISD::CTPOP, VT, Custom);
1581 setOperationAction(ISD::CTTZ, VT, Custom);
1583 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1584 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1585 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1588 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1589 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1590 if (Subtarget.hasVLX()) {
1591 // FIXME: These operations are also available on SSE/AVX2; add the relevant patterns.
1592 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1593 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
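// AVX512VL adds the v2i1 and v4i1 mask register classes used with 128-bit and
// 256-bit vectors.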
1598 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1599 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1600 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1602 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1603 setOperationAction(ISD::ADD, VT, Custom);
1604 setOperationAction(ISD::SUB, VT, Custom);
1605 setOperationAction(ISD::MUL, VT, Custom);
1606 setOperationAction(ISD::VSELECT, VT, Expand);
1608 setOperationAction(ISD::TRUNCATE, VT, Custom);
1609 setOperationAction(ISD::SETCC, VT, Custom);
1610 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1611 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1612 setOperationAction(ISD::SELECT, VT, Custom);
1613 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1614 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1617 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1618 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1619 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1620 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1622 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1623 setOperationAction(ISD::SMAX, VT, Legal);
1624 setOperationAction(ISD::UMAX, VT, Legal);
1625 setOperationAction(ISD::SMIN, VT, Legal);
1626 setOperationAction(ISD::UMIN, VT, Legal);
1630 // We want to custom lower some of our intrinsics.
1631 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1632 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1633 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1634 if (!Subtarget.is64Bit()) {
1635 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1636 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1639 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1640 // handle type legalization for these operations here.
1642 // FIXME: We really should do custom legalization for addition and
1643 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1644 // than generic legalization for 64-bit multiplication-with-overflow, though.
1645 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1646 if (VT == MVT::i64 && !Subtarget.is64Bit())
1648 // Add/Sub/Mul with overflow operations are custom lowered.
1649 setOperationAction(ISD::SADDO, VT, Custom);
1650 setOperationAction(ISD::UADDO, VT, Custom);
1651 setOperationAction(ISD::SSUBO, VT, Custom);
1652 setOperationAction(ISD::USUBO, VT, Custom);
1653 setOperationAction(ISD::SMULO, VT, Custom);
1654 setOperationAction(ISD::UMULO, VT, Custom);
1657 if (!Subtarget.is64Bit()) {
1658 // These libcalls are not available in 32-bit.
1659 setLibcallName(RTLIB::SHL_I128, nullptr);
1660 setLibcallName(RTLIB::SRL_I128, nullptr);
1661 setLibcallName(RTLIB::SRA_I128, nullptr);
1664 // Combine sin / cos into one node or libcall if possible.
1665 if (Subtarget.hasSinCos()) {
1666 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1667 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1668 if (Subtarget.isTargetDarwin()) {
1669 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1670 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1671 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1672 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1676 if (Subtarget.isTargetWin64()) {
1677 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1678 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1679 setOperationAction(ISD::SREM, MVT::i128, Custom);
1680 setOperationAction(ISD::UREM, MVT::i128, Custom);
1681 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1682 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1685 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1686 // is. We should promote the value to 64 bits to solve this.
1687 // This is what the CRT headers do - `fmodf` is an inline header
1688 // function casting to f64 and calling `fmod`.
1689 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1690 Subtarget.isTargetWindowsItanium()))
1691 for (ISD::NodeType Op :
1692 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1693 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1694 if (isOperationExpand(Op, MVT::f32))
1695 setOperationAction(Op, MVT::f32, Promote);
1697 // We have target-specific dag combine patterns for the following nodes:
1698 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1699 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1700 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1701 setTargetDAGCombine(ISD::BITCAST);
1702 setTargetDAGCombine(ISD::VSELECT);
1703 setTargetDAGCombine(ISD::SELECT);
1704 setTargetDAGCombine(ISD::SHL);
1705 setTargetDAGCombine(ISD::SRA);
1706 setTargetDAGCombine(ISD::SRL);
1707 setTargetDAGCombine(ISD::OR);
1708 setTargetDAGCombine(ISD::AND);
1709 setTargetDAGCombine(ISD::ADD);
1710 setTargetDAGCombine(ISD::FADD);
1711 setTargetDAGCombine(ISD::FSUB);
1712 setTargetDAGCombine(ISD::FNEG);
1713 setTargetDAGCombine(ISD::FMA);
1714 setTargetDAGCombine(ISD::FMINNUM);
1715 setTargetDAGCombine(ISD::FMAXNUM);
1716 setTargetDAGCombine(ISD::SUB);
1717 setTargetDAGCombine(ISD::LOAD);
1718 setTargetDAGCombine(ISD::MLOAD);
1719 setTargetDAGCombine(ISD::STORE);
1720 setTargetDAGCombine(ISD::MSTORE);
1721 setTargetDAGCombine(ISD::TRUNCATE);
1722 setTargetDAGCombine(ISD::ZERO_EXTEND);
1723 setTargetDAGCombine(ISD::ANY_EXTEND);
1724 setTargetDAGCombine(ISD::SIGN_EXTEND);
1725 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1726 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1727 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1728 setTargetDAGCombine(ISD::SINT_TO_FP);
1729 setTargetDAGCombine(ISD::UINT_TO_FP);
1730 setTargetDAGCombine(ISD::SETCC);
1731 setTargetDAGCombine(ISD::MUL);
1732 setTargetDAGCombine(ISD::XOR);
1733 setTargetDAGCombine(ISD::MSCATTER);
1734 setTargetDAGCombine(ISD::MGATHER);
1736 computeRegisterProperties(Subtarget.getRegisterInfo());
1738 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1739 MaxStoresPerMemsetOptSize = 8;
1740 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1741 MaxStoresPerMemcpyOptSize = 4;
1742 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1743 MaxStoresPerMemmoveOptSize = 4;
1744 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1745 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1747 // An out-of-order CPU can speculatively execute past a predictable branch,
1748 // but a conditional move could be stalled by an expensive earlier operation.
1749 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1750 EnableExtLdPromotion = true;
1751 setPrefFunctionAlignment(4); // 2^4 bytes.
1753 verifyIntrinsicTables();
1756 // This has so far only been implemented for 64-bit MachO.
1757 bool X86TargetLowering::useLoadStackGuardNode() const {
1758 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1761 TargetLoweringBase::LegalizeTypeAction
1762 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1763 if (ExperimentalVectorWideningLegalization &&
1764 VT.getVectorNumElements() != 1 &&
1765 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1766 return TypeWidenVector;
1768 return TargetLoweringBase::getPreferredVectorAction(VT);
1771 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1772 LLVMContext& Context,
1775 return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
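// For vector compares, prefer the AVX-512 i1 mask vector types when the
// subtarget supports them.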
1777 if (VT.isSimple()) {
1778 MVT VVT = VT.getSimpleVT();
1779 const unsigned NumElts = VVT.getVectorNumElements();
1780 MVT EltVT = VVT.getVectorElementType();
1781 if (VVT.is512BitVector()) {
1782 if (Subtarget.hasAVX512())
1783 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1784 EltVT == MVT::f32 || EltVT == MVT::f64)
1786 case 8: return MVT::v8i1;
1787 case 16: return MVT::v16i1;
1789 if (Subtarget.hasBWI())
1790 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1792 case 32: return MVT::v32i1;
1793 case 64: return MVT::v64i1;
1797 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1798 return MVT::getVectorVT(MVT::i1, NumElts);
1800 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1801 EVT LegalVT = getTypeToTransformTo(Context, VT);
1802 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1805 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1807 case 2: return MVT::v2i1;
1808 case 4: return MVT::v4i1;
1809 case 8: return MVT::v8i1;
1813 return VT.changeVectorElementTypeToInteger();
1816 /// Helper for getByValTypeAlignment to determine
1817 /// the desired ByVal argument alignment.
1818 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1821 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1822 if (VTy->getBitWidth() == 128)
1824 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1825 unsigned EltAlign = 0;
1826 getMaxByValAlign(ATy->getElementType(), EltAlign);
1827 if (EltAlign > MaxAlign)
1828 MaxAlign = EltAlign;
1829 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1830 for (auto *EltTy : STy->elements()) {
1831 unsigned EltAlign = 0;
1832 getMaxByValAlign(EltTy, EltAlign);
1833 if (EltAlign > MaxAlign)
1834 MaxAlign = EltAlign;
1841 /// Return the desired alignment for ByVal aggregate
1842 /// function arguments in the caller parameter area. For X86, aggregates
1843 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1844 /// are at 4-byte boundaries.
1845 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1846 const DataLayout &DL) const {
1847 if (Subtarget.is64Bit()) {
1848 // Max of 8 and alignment of type.
1849 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1856 if (Subtarget.hasSSE1())
1857 getMaxByValAlign(Ty, Align);
1861 /// Returns the target-specific optimal type for load
1862 /// and store operations as a result of memset, memcpy, and memmove
1863 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1864 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1865 /// against the alignment requirement,
1866 /// probably because the source does not need to be loaded. If 'IsMemset' is
1867 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1868 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1869 /// source is constant so it does not need to be loaded.
1870 /// It returns EVT::Other if the type should be determined using generic
1871 /// target-independent logic.
1873 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1874 unsigned DstAlign, unsigned SrcAlign,
1875 bool IsMemset, bool ZeroMemset,
1877 MachineFunction &MF) const {
1878 const Function *F = MF.getFunction();
1879 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1881 (!Subtarget.isUnalignedMem16Slow() ||
1882 ((DstAlign == 0 || DstAlign >= 16) &&
1883 (SrcAlign == 0 || SrcAlign >= 16)))) {
1884 // FIXME: Check if unaligned 32-byte accesses are slow.
1885 if (Size >= 32 && Subtarget.hasAVX()) {
1886 // Although this isn't a well-supported type for AVX1, we'll let
1887 // legalization and shuffle lowering produce the optimal codegen. If we
1888 // choose an optimal type with a vector element larger than a byte,
1889 // getMemsetStores() may create an intermediate splat (using an integer
1890 // multiply) before we splat as a vector.
1893 if (Subtarget.hasSSE2())
1895 // TODO: Can SSE1 handle a byte vector?
1896 if (Subtarget.hasSSE1())
1898 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1899 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1900 // Do not use f64 to lower memcpy if source is string constant. It's
1901 // better to use i32 to avoid the loads.
1902 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1903 // The gymnastics of splatting a byte value into an XMM register and then
1904 // only using 8-byte stores (because this is a CPU with slow unaligned
1905 // 16-byte accesses) makes that a loser.
1909 // This is a compromise. If we reach here, unaligned accesses may be slow on
1910 // this target. However, creating smaller, aligned accesses could be even
1911 // slower and would certainly be a lot more code.
1912 if (Subtarget.is64Bit() && Size >= 8)
1917 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1919 return X86ScalarSSEf32;
1920 else if (VT == MVT::f64)
1921 return X86ScalarSSEf64;
1926 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1931 switch (VT.getSizeInBits()) {
1933 // 8-byte and under are always assumed to be fast.
1937 *Fast = !Subtarget.isUnalignedMem16Slow();
1940 *Fast = !Subtarget.isUnalignedMem32Slow();
1942 // TODO: What about AVX-512 (512-bit) accesses?
1945 // Misaligned accesses of any size are always allowed.
1949 /// Return the entry encoding for a jump table in the
1950 /// current function. The returned value is a member of the
1951 /// MachineJumpTableInfo::JTEntryKind enum.
1952 unsigned X86TargetLowering::getJumpTableEncoding() const {
1953 // In GOT PIC mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1955 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1956 return MachineJumpTableInfo::EK_Custom32;
1958 // Otherwise, use the normal jump table encoding heuristics.
1959 return TargetLowering::getJumpTableEncoding();
1962 bool X86TargetLowering::useSoftFloat() const {
1963 return Subtarget.useSoftFloat();
1966 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1967 ArgListTy &Args) const {
1969 // Only relabel X86-32 for C / Stdcall CCs.
1970 if (Subtarget.is64Bit())
1972 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1974 unsigned ParamRegs = 0;
1975 if (auto *M = MF->getFunction()->getParent())
1976 ParamRegs = M->getNumberRegisterParameters();
1978 // Mark the first N integer (or pointer) arguments as being passed in registers.
1979 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1980 Type *T = Args[Idx].Ty;
1981 if (T->isPointerTy() || T->isIntegerTy())
1982 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1983 unsigned numRegs = 1;
1984 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1986 if (ParamRegs < numRegs)
1988 ParamRegs -= numRegs;
1989 Args[Idx].IsInReg = true;
1995 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1996 const MachineBasicBlock *MBB,
1997 unsigned uid,MCContext &Ctx) const{
1998 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1999 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF relocations.
2001 return MCSymbolRefExpr::create(MBB->getSymbol(),
2002 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2005 /// Returns relocation base for the given PIC jumptable.
2006 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2007 SelectionDAG &DAG) const {
2008 if (!Subtarget.is64Bit())
2009 // This doesn't have SDLoc associated with it, but is not really the
2010 // same as a Register.
2011 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2012 getPointerTy(DAG.getDataLayout()));
2016 /// This returns the relocation base for the given PIC jumptable,
2017 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2018 const MCExpr *X86TargetLowering::
2019 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2020 MCContext &Ctx) const {
2021 // X86-64 uses RIP relative addressing based on the jump table label.
2022 if (Subtarget.isPICStyleRIPRel())
2023 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2025 // Otherwise, the reference is relative to the PIC base.
2026 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2029 std::pair<const TargetRegisterClass *, uint8_t>
2030 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2032 const TargetRegisterClass *RRC = nullptr;
2034 switch (VT.SimpleTy) {
2036 return TargetLowering::findRepresentativeClass(TRI, VT);
2037 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2038 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2041 RRC = &X86::VR64RegClass;
2043 case MVT::f32: case MVT::f64:
2044 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2045 case MVT::v4f32: case MVT::v2f64:
2046 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2047 case MVT::v8f32: case MVT::v4f64:
2048 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2049 case MVT::v16f32: case MVT::v8f64:
2050 RRC = &X86::VR128XRegClass;
2053 return std::make_pair(RRC, Cost);
2056 unsigned X86TargetLowering::getAddressSpace() const {
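// Address space 256 corresponds to the %gs segment and 257 to %fs.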
2057 if (Subtarget.is64Bit())
2058 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2062 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2063 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2064 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2067 static Constant* SegmentOffset(IRBuilder<> &IRB,
2068 unsigned Offset, unsigned AddressSpace) {
2069 return ConstantExpr::getIntToPtr(
2070 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2071 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2074 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2075 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2076 // tcbhead_t; use it instead of the usual global variable (see
2077 // sysdeps/{i386,x86_64}/nptl/tls.h)
2078 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2079 if (Subtarget.isTargetFuchsia()) {
2080 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2081 return SegmentOffset(IRB, 0x10, getAddressSpace());
2083 // %fs:0x28, unless we're using a Kernel code model, in which case
2084 // it's %gs:0x28. gs:0x14 on i386.
2085 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2086 return SegmentOffset(IRB, Offset, getAddressSpace());
2090 return TargetLowering::getIRStackGuard(IRB);
2093 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2094 // The MSVC CRT provides functionality for stack protection.
2095 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2096 // MSVC CRT has a global variable holding security cookie.
2097 M.getOrInsertGlobal("__security_cookie",
2098 Type::getInt8PtrTy(M.getContext()));
2100 // MSVC CRT has a function to validate security cookie.
2101 auto *SecurityCheckCookie = cast<Function>(
2102 M.getOrInsertFunction("__security_check_cookie",
2103 Type::getVoidTy(M.getContext()),
2104 Type::getInt8PtrTy(M.getContext())));
2105 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2106 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2109 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2110 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2112 TargetLowering::insertSSPDeclarations(M);
2115 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2116 // MSVC CRT has a global variable holding security cookie.
2117 if (Subtarget.getTargetTriple().isOSMSVCRT())
2118 return M.getGlobalVariable("__security_cookie");
2119 return TargetLowering::getSDagStackGuard(M);
2122 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2123 // MSVC CRT has a function to validate security cookie.
2124 if (Subtarget.getTargetTriple().isOSMSVCRT())
2125 return M.getFunction("__security_check_cookie");
2126 return TargetLowering::getSSPStackGuardCheck(M);
2129 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2130 if (Subtarget.getTargetTriple().isOSContiki())
2131 return getDefaultSafeStackPointerLocation(IRB, false);
2133 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2134 // definition of TLS_SLOT_SAFESTACK in
2135 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2136 if (Subtarget.isTargetAndroid()) {
2137 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2139 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2140 return SegmentOffset(IRB, Offset, getAddressSpace());
2143 // Fuchsia is similar.
2144 if (Subtarget.isTargetFuchsia()) {
2145 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2146 return SegmentOffset(IRB, 0x18, getAddressSpace());
2149 return TargetLowering::getSafeStackPointerLocation(IRB);
2152 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2153 unsigned DestAS) const {
2154 assert(SrcAS != DestAS && "Expected different address spaces!");
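// Only the segment-based address spaces (256 and above) change addressing
// semantics; casts between the flat address spaces are no-ops.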
2156 return SrcAS < 256 && DestAS < 256;
2159 //===----------------------------------------------------------------------===//
2160 // Return Value Calling Convention Implementation
2161 //===----------------------------------------------------------------------===//
2163 #include "X86GenCallingConv.inc"
2165 bool X86TargetLowering::CanLowerReturn(
2166 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2167 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2168 SmallVector<CCValAssign, 16> RVLocs;
2169 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2170 return CCInfo.CheckReturn(Outs, RetCC_X86);
2173 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
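// R11 is not used for argument passing and is always caller-saved, so it is
// safe to use as a scratch register.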
2174 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2178 /// Lowers mask values (v*i1) to the local register values.
2179 /// \returns DAG node after lowering to register type.
2180 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2181 const SDLoc &Dl, SelectionDAG &DAG) {
2182 EVT ValVT = ValArg.getValueType();
2184 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2185 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2186 // Two stage lowering might be required
2187 // bitcast: v8i1 -> i8 / v16i1 -> i16
2188 // anyextend: i8 -> i32 / i16 -> i32
2189 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2190 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2191 if (ValLoc == MVT::i32)
2192 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2194 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2195 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2196 // One stage lowering is required
2197 // bitcast: v32i1 -> i32 / v64i1 -> i64
2198 return DAG.getBitcast(ValLoc, ValArg);
2200 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2203 /// Breaks a v64i1 value into two registers and adds the new node to the DAG.
2204 static void Passv64i1ArgInRegs(
2205 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2206 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2207 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2208 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2209 "Expected AVX512BW or AVX512BMI target!");
2210 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2211 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2212 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2213 "The value should reside in two registers");
2215 // Before splitting the value we cast it to i64
2216 Arg = DAG.getBitcast(MVT::i64, Arg);
2218 // Splitting the value into two i32 types
2220 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2221 DAG.getConstant(0, Dl, MVT::i32));
2222 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2223 DAG.getConstant(1, Dl, MVT::i32));
2225 // Attach the two i32 values to the corresponding registers.
2226 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2227 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2231 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2233 const SmallVectorImpl<ISD::OutputArg> &Outs,
2234 const SmallVectorImpl<SDValue> &OutVals,
2235 const SDLoc &dl, SelectionDAG &DAG) const {
2236 MachineFunction &MF = DAG.getMachineFunction();
2237 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2239 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2240 report_fatal_error("X86 interrupts may not return any value");
2242 SmallVector<CCValAssign, 16> RVLocs;
2243 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2244 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2247 SmallVector<SDValue, 6> RetOps;
2248 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2249 // Operand #1 = Bytes To Pop
2250 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2253 // Copy the result values into the output registers.
2254 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2256 CCValAssign &VA = RVLocs[I];
2257 assert(VA.isRegLoc() && "Can only return in registers!");
2259 // Add the register to the CalleeSaveDisableRegs list.
2260 if (CallConv == CallingConv::X86_RegCall)
2261 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2263 SDValue ValToCopy = OutVals[OutsIndex];
2264 EVT ValVT = ValToCopy.getValueType();
2266 // Promote values to the appropriate types.
2267 if (VA.getLocInfo() == CCValAssign::SExt)
2268 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2269 else if (VA.getLocInfo() == CCValAssign::ZExt)
2270 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2271 else if (VA.getLocInfo() == CCValAssign::AExt) {
2272 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2273 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2275 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2277 else if (VA.getLocInfo() == CCValAssign::BCvt)
2278 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2280 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2281 "Unexpected FP-extend for return value.");
2283 // If this is x86-64, and we disabled SSE, we can't return FP values,
2284 // or SSE or MMX vectors.
2285 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2286 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2287 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2288 report_fatal_error("SSE register return with SSE disabled");
2290 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2291 // llvm-gcc has never done it right and no one has noticed, so this
2292 // should be OK for now.
2293 if (ValVT == MVT::f64 &&
2294 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2295 report_fatal_error("SSE2 register return with SSE2 disabled");
2297 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2298 // the RET instruction and handled by the FP Stackifier.
2299 if (VA.getLocReg() == X86::FP0 ||
2300 VA.getLocReg() == X86::FP1) {
2301 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2302 // change the value to the FP stack register class.
2303 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2304 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2305 RetOps.push_back(ValToCopy);
2306 // Don't emit a copytoreg.
2310 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2311 // which is returned in RAX / RDX.
2312 if (Subtarget.is64Bit()) {
2313 if (ValVT == MVT::x86mmx) {
2314 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2315 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2316 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2318 // If we don't have SSE2 available, convert to v4f32 so the generated
2319 // register is legal.
2320 if (!Subtarget.hasSSE2())
2321 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2326 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2328 if (VA.needsCustom()) {
2329 assert(VA.getValVT() == MVT::v64i1 &&
2330 "Currently the only custom case is when we split v64i1 to 2 regs");
2332 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2335 assert(2 == RegsToPass.size() &&
2336 "Expecting two registers after Pass64BitArgInRegs");
2338 // Add the second register to the CalleeSaveDisableRegs list.
2339 if (CallConv == CallingConv::X86_RegCall)
2340 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2342 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2345 // Add nodes to the DAG and add the values into the RetOps list
2346 for (auto &Reg : RegsToPass) {
2347 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2348 Flag = Chain.getValue(1);
2349 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2353 // Swift calling convention does not require we copy the sret argument
2354 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2356 // All x86 ABIs require that for returning structs by value we copy
2357 // the sret argument into %rax/%eax (depending on ABI) for the return.
2358 // We saved the argument into a virtual register in the entry block,
2359 // so now we copy the value out and into %rax/%eax.
2361 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2362 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2363 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2364 // either case FuncInfo->setSRetReturnReg() will have been called.
2365 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2366 // When we have both sret and another return value, we should use the
2367 // original Chain stored in RetOps[0], instead of the current Chain updated
2368 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2370 // For the case of sret and another return value, we have
2371 // Chain_0 at the function entry
2372 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2373 // If we use Chain_1 in getCopyFromReg, we will have
2374 // Val = getCopyFromReg(Chain_1)
2375 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2377 // getCopyToReg(Chain_0) will be glued together with
2378 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2379 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2380 // Data dependency from Unit B to Unit A due to usage of Val in
2381 // getCopyToReg(Chain_1, Val)
2382 // Chain dependency from Unit A to Unit B
2384 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2385 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2386 getPointerTy(MF.getDataLayout()));
2389 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2390 X86::RAX : X86::EAX;
2391 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2392 Flag = Chain.getValue(1);
2394 // RAX/EAX now acts like a return value.
2396 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2398 // Add the returned register to the CalleeSaveDisableRegs list.
2399 if (CallConv == CallingConv::X86_RegCall)
2400 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2403 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2404 const MCPhysReg *I =
2405 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2408 if (X86::GR64RegClass.contains(*I))
2409 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2411 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2415 RetOps[0] = Chain; // Update chain.
2417 // Add the flag if we have it.
2419 RetOps.push_back(Flag);
2421 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2422 if (CallConv == CallingConv::X86_INTR)
2423 opcode = X86ISD::IRET;
2424 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2427 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2428 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2431 SDValue TCChain = Chain;
2432 SDNode *Copy = *N->use_begin();
2433 if (Copy->getOpcode() == ISD::CopyToReg) {
2434 // If the copy has a glue operand, we conservatively assume it isn't safe to
2435 // perform a tail call.
2436 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2438 TCChain = Copy->getOperand(0);
2439 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2442 bool HasRet = false;
2443 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2445 if (UI->getOpcode() != X86ISD::RET_FLAG)
2447 // If we are returning more than one value, we can definitely
2448 // not make a tail call; see PR19530.
2449 if (UI->getNumOperands() > 4)
2451 if (UI->getNumOperands() == 4 &&
2452 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2464 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2465 ISD::NodeType ExtendKind) const {
2466 MVT ReturnMVT = MVT::i32;
2468 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2469 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2470 // The ABI does not require i1, i8 or i16 to be extended.
2472 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2473 // always extending i8/i16 return values, so keep doing that for now.
2475 ReturnMVT = MVT::i8;
2478 EVT MinVT = getRegisterType(Context, ReturnMVT);
2479 return VT.bitsLT(MinVT) ? MinVT : VT;
2482 /// Reads two 32 bit registers and creates a 64 bit mask value.
2483 /// \param VA The current 32 bit value that needs to be assigned.
2484 /// \param NextVA The next 32 bit value that needs to be assigned.
2485 /// \param Root The parent DAG node.
2486 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2487 ///                        glue purposes. If the DAG is already using a
2488 ///                        physical register instead of a virtual one, we
2489 ///                        should glue our new SDValue to the InFlag SDValue.
2490 /// \return a new SDValue of size 64 bits.
2491 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2492 SDValue &Root, SelectionDAG &DAG,
2493 const SDLoc &Dl, const X86Subtarget &Subtarget,
2494 SDValue *InFlag = nullptr) {
2495 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2496 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2497 assert(VA.getValVT() == MVT::v64i1 &&
2498 "Expecting first location of 64 bit width type");
2499 assert(NextVA.getValVT() == VA.getValVT() &&
2500 "The locations should have the same type");
2501 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2502 "The values should reside in two registers");
2506 SDValue ArgValueLo, ArgValueHi;
2508 MachineFunction &MF = DAG.getMachineFunction();
2509 const TargetRegisterClass *RC = &X86::GR32RegClass;
2511 // Read a 32 bit value from the registers
2512 if (nullptr == InFlag) {
2513 // When no physical register is present,
2514 // create an intermediate virtual register
2515 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2516 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2517 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2518 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2520 // When a physical register is available read the value from it and glue
2521 // the reads together.
2523 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2524 *InFlag = ArgValueLo.getValue(2);
2526 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2527 *InFlag = ArgValueHi.getValue(2);
2530 // Convert the i32 type into v32i1 type
2531 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2533 // Convert the i32 type into v32i1 type
2534 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2536 // Concatenate the two values together
2537 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2540 /// The function will lower a register of various sizes (8/16/32/64)
2541 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2542 /// \returns a DAG node containing the operand after lowering to mask type.
2543 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2544 const EVT &ValLoc, const SDLoc &Dl,
2545 SelectionDAG &DAG) {
2546 SDValue ValReturned = ValArg;
2548 if (ValVT == MVT::v64i1) {
2549 // On 32-bit machines, this case is handled by getv64i1Argument.
2550 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2551 // On 64-bit machines, there is no need to truncate the value; only a bitcast is needed.
2554 switch (ValVT.getSimpleVT().SimpleTy) {
2565 llvm_unreachable("Expecting a vector of i1 types");
2568 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2571 return DAG.getBitcast(ValVT, ValReturned);
2574 /// Lower the result values of a call into the
2575 /// appropriate copies out of appropriate physical registers.
2577 SDValue X86TargetLowering::LowerCallResult(
2578 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2579 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2580 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2581 uint32_t *RegMask) const {
2583 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2584 // Assign locations to each value returned by this call.
2585 SmallVector<CCValAssign, 16> RVLocs;
2586 bool Is64Bit = Subtarget.is64Bit();
2587 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2589 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2591 // Copy all of the result registers out of their specified physreg.
2592 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2594 CCValAssign &VA = RVLocs[I];
2595 EVT CopyVT = VA.getLocVT();
2597 // In some calling conventions we need to remove the used registers
2598 // from the register mask.
2599 if (RegMask && CallConv == CallingConv::X86_RegCall) {
2600 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2601 SubRegs.isValid(); ++SubRegs)
2602 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2605 // If this is x86-64, and we disabled SSE, we can't return FP values
2606 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2607 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2608 report_fatal_error("SSE register return with SSE disabled");
2611 // If we prefer to use the value in xmm registers, copy it out as f80 and
2612 // use a truncate to move it from fp stack reg to xmm reg.
2613 bool RoundAfterCopy = false;
2614 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2615 isScalarFPTypeInSSEReg(VA.getValVT())) {
2616 if (!Subtarget.hasX87())
2617 report_fatal_error("X87 register return with X87 disabled");
2619 RoundAfterCopy = (CopyVT != VA.getLocVT());
2623 if (VA.needsCustom()) {
2624 assert(VA.getValVT() == MVT::v64i1 &&
2625 "Currently the only custom case is when we split v64i1 to 2 regs");
2627 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2629 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2631 Val = Chain.getValue(0);
2632 InFlag = Chain.getValue(2);
2636 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2637 // This truncation won't change the value.
2638 DAG.getIntPtrConstant(1, dl));
2640 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2641 if (VA.getValVT().isVector() &&
2642 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2643 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2644 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2645 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2647 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2650 InVals.push_back(Val);
2656 //===----------------------------------------------------------------------===//
2657 // C & StdCall & Fast Calling Convention implementation
2658 //===----------------------------------------------------------------------===//
2659 // The StdCall calling convention is the standard for many Windows API
2660 // routines. It differs from the C calling convention only slightly: the
2661 // callee cleans up the stack rather than the caller, and symbols are
2662 // decorated (on x86-32, with an "@<bytes-of-arguments>" suffix). It doesn't
2663 // support any vector arguments. For info on the fast calling convention,
2664 // see the Fast Calling Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2666 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2668 enum StructReturnType {
2673 static StructReturnType
2674 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2676 return NotStructReturn;
2678 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2679 if (!Flags.isSRet())
2680 return NotStructReturn;
2681 if (Flags.isInReg() || IsMCU)
2682 return RegStructReturn;
2683 return StackStructReturn;
2686 /// Determines whether a function uses struct return semantics.
2687 static StructReturnType
2688 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2690 return NotStructReturn;
2692 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2693 if (!Flags.isSRet())
2694 return NotStructReturn;
2695 if (Flags.isInReg() || IsMCU)
2696 return RegStructReturn;
2697 return StackStructReturn;
2700 /// Make a copy of an aggregate at address specified by "Src" to address
2701 /// "Dst" with size and alignment information specified by the specific
2702 /// parameter attribute. The copy will be passed as a byval function parameter.
2703 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2704 SDValue Chain, ISD::ArgFlagsTy Flags,
2705 SelectionDAG &DAG, const SDLoc &dl) {
2706 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2708 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2709 /*isVolatile*/false, /*AlwaysInline=*/true,
2710 /*isTailCall*/false,
2711 MachinePointerInfo(), MachinePointerInfo());
2714 /// Return true if the calling convention is one that we can guarantee TCO for.
2715 static bool canGuaranteeTCO(CallingConv::ID CC) {
2716 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2717 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2718 CC == CallingConv::HHVM);
2721 /// Return true if we might ever do TCO for calls with this calling convention.
2722 static bool mayTailCallThisCC(CallingConv::ID CC) {
2724 // C calling conventions:
2725 case CallingConv::C:
2726 case CallingConv::X86_64_Win64:
2727 case CallingConv::X86_64_SysV:
2728 // Callee pop conventions:
2729 case CallingConv::X86_ThisCall:
2730 case CallingConv::X86_StdCall:
2731 case CallingConv::X86_VectorCall:
2732 case CallingConv::X86_FastCall:
2735 return canGuaranteeTCO(CC);
2739 /// Return true if the function is being made into a tailcall target by
2740 /// changing its ABI.
2741 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2742 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2745 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2747 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2748 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2751 ImmutableCallSite CS(CI);
2752 CallingConv::ID CalleeCC = CS.getCallingConv();
2753 if (!mayTailCallThisCC(CalleeCC))
2760 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2761 const SmallVectorImpl<ISD::InputArg> &Ins,
2762 const SDLoc &dl, SelectionDAG &DAG,
2763 const CCValAssign &VA,
2764 MachineFrameInfo &MFI, unsigned i) const {
2765 // Create the nodes corresponding to a load from this parameter slot.
2766 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2767 bool AlwaysUseMutable = shouldGuaranteeTCO(
2768 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2769 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2771 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2773 // If the value is passed by pointer, we have the address passed instead of
2774 // the value itself. No need to extend if the mask value and location share
2775 // the same size.
2776 bool ExtendedInMem =
2777 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2778 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2780 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2781 ValVT = VA.getLocVT();
2783 ValVT = VA.getValVT();
2785 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2786 // taken by a return address.
2788 if (CallConv == CallingConv::X86_INTR) {
2789 // X86 interrupts may take one or two arguments.
2790 // On the stack there will be no return address as in a regular call.
2791 // The offset of the last argument needs to be set to -4/-8 bytes.
2792 // The offset of the first argument (when there are two) should be set to 0 bytes.
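// Illustration for the 64-bit case: the CPU pushes SS, RSP, RFLAGS, CS and RIP
// (plus, for some exceptions, an error code below them), but no return address.
// With two arguments the error code therefore lands at offset -8 and the
// interrupt-frame argument at offset 0 in the fixed-object numbering used here.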
2793 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2794 if (Subtarget.is64Bit() && Ins.size() == 2) {
2795 // The stack pointer needs to be realigned for 64 bit handlers with error
2796 // code, so the argument offset changes by 8 bytes.
2801 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2802 // changed with more analysis.
2803 // In case of tail call optimization, mark all arguments mutable, since they
2804 // could be overwritten by the lowering of arguments in case of a tail call.
2805 if (Flags.isByVal()) {
2806 unsigned Bytes = Flags.getByValSize();
2807 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2808 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2809 // Adjust SP offset of interrupt parameter.
2810 if (CallConv == CallingConv::X86_INTR) {
2811 MFI.setObjectOffset(FI, Offset);
2813 return DAG.getFrameIndex(FI, PtrVT);
2816 // This is an argument in memory. We might be able to perform copy elision.
2817 if (Flags.isCopyElisionCandidate()) {
2818 EVT ArgVT = Ins[i].ArgVT;
2820 if (Ins[i].PartOffset == 0) {
2821 // If this is a one-part value or the first part of a multi-part value,
2822 // create a stack object for the entire argument value type and return a
2823 // load from our portion of it. This assumes that if the first part of an
2824 // argument is in memory, the rest will also be in memory.
2825 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2826 /*Immutable=*/false);
2827 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2829 ValVT, dl, Chain, PartAddr,
2830 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2832 // This is not the first piece of an argument in memory. See if there is
2833 // already a fixed stack object including this offset. If so, assume it
2834 // was created by the PartOffset == 0 branch above and create a load from
2835 // the appropriate offset into it.
2836 int64_t PartBegin = VA.getLocMemOffset();
2837 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2838 int FI = MFI.getObjectIndexBegin();
2839 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2840 int64_t ObjBegin = MFI.getObjectOffset(FI);
2841 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2842 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2845 if (MFI.isFixedObjectIndex(FI)) {
2847 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2848 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2850 ValVT, dl, Chain, Addr,
2851 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2852 Ins[i].PartOffset));
2857 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2858 VA.getLocMemOffset(), isImmutable);
2860 // Set SExt or ZExt flag.
2861 if (VA.getLocInfo() == CCValAssign::ZExt) {
2862 MFI.setObjectZExt(FI, true);
2863 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2864 MFI.setObjectSExt(FI, true);
2867 // Adjust SP offset of interrupt parameter.
2868 if (CallConv == CallingConv::X86_INTR) {
2869 MFI.setObjectOffset(FI, Offset);
2872 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2873 SDValue Val = DAG.getLoad(
2874 ValVT, dl, Chain, FIN,
2875 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2876 return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
2877 : Val;
2880 // FIXME: Get this from tablegen.
2881 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2882 const X86Subtarget &Subtarget) {
2883 assert(Subtarget.is64Bit());
2885 if (Subtarget.isCallingConvWin64(CallConv)) {
2886 static const MCPhysReg GPR64ArgRegsWin64[] = {
2887 X86::RCX, X86::RDX, X86::R8, X86::R9
2889 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2895 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2898 // FIXME: Get this from tablegen.
2899 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2900 CallingConv::ID CallConv,
2901 const X86Subtarget &Subtarget) {
2902 assert(Subtarget.is64Bit());
2903 if (Subtarget.isCallingConvWin64(CallConv)) {
2904 // The XMM registers which might contain var arg parameters are shadowed
2905 // in their paired GPR, so we only need to save the GPRs to their home slots.
2907 // TODO: __vectorcall will change this.
2911 const Function *Fn = MF.getFunction();
2912 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2913 bool isSoftFloat = Subtarget.useSoftFloat();
2914 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2915 "SSE register cannot be used when SSE is disabled!");
2916 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2917 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2918 // registers.
2919 return None;
2921 static const MCPhysReg XMMArgRegs64Bit[] = {
2922 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2923 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2925 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2929 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2930 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2931 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2932 return A.getValNo() < B.getValNo();
2937 SDValue X86TargetLowering::LowerFormalArguments(
2938 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2939 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2940 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2941 MachineFunction &MF = DAG.getMachineFunction();
2942 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2943 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2945 const Function *Fn = MF.getFunction();
2946 if (Fn->hasExternalLinkage() &&
2947 Subtarget.isTargetCygMing() &&
2948 Fn->getName() == "main")
2949 FuncInfo->setForceFramePointer(true);
2951 MachineFrameInfo &MFI = MF.getFrameInfo();
2952 bool Is64Bit = Subtarget.is64Bit();
2953 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2955 assert(
2956 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2957 "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
2959 if (CallConv == CallingConv::X86_INTR) {
2960 bool isLegal = Ins.size() == 1 ||
2961 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2962 (!Is64Bit && Ins[1].VT == MVT::i32)));
2963 if (!isLegal)
2964 report_fatal_error("X86 interrupts may take one or two arguments");
2967 // Assign locations to all of the incoming arguments.
2968 SmallVector<CCValAssign, 16> ArgLocs;
2969 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2971 // Allocate shadow area for Win64.
2972 if (IsWin64)
2973 CCInfo.AllocateStack(32, 8);
2975 CCInfo.AnalyzeArguments(Ins, CC_X86);
2977 // In the vectorcall calling convention a second pass is required for the HVA
2978 // registers.
2979 if (CallingConv::X86_VectorCall == CallConv) {
2980 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2983 // The next loop assumes that the locations are in the same order as the
2984 // Ins array.
2985 assert(isSortedByValueNo(ArgLocs) &&
2986 "Argument Location list must be sorted before lowering");
2989 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2991 assert(InsIndex < Ins.size() && "Invalid Ins index");
2992 CCValAssign &VA = ArgLocs[I];
2993 SDValue ArgValue;
2994 if (VA.isRegLoc()) {
2995 EVT RegVT = VA.getLocVT();
2996 if (VA.needsCustom()) {
2998 VA.getValVT() == MVT::v64i1 &&
2999 "Currently the only custom case is when we split v64i1 to 2 regs");
3001 // v64i1 values, in regcall calling convention, that are
3002 // compiled to 32 bit arch, are split up into two registers.
3003 ArgValue =
3004 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3006 const TargetRegisterClass *RC;
3007 if (RegVT == MVT::i32)
3008 RC = &X86::GR32RegClass;
3009 else if (Is64Bit && RegVT == MVT::i64)
3010 RC = &X86::GR64RegClass;
3011 else if (RegVT == MVT::f32)
3012 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3013 else if (RegVT == MVT::f64)
3014 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3015 else if (RegVT == MVT::f80)
3016 RC = &X86::RFP80RegClass;
3017 else if (RegVT == MVT::f128)
3018 RC = &X86::FR128RegClass;
3019 else if (RegVT.is512BitVector())
3020 RC = &X86::VR512RegClass;
3021 else if (RegVT.is256BitVector())
3022 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3023 else if (RegVT.is128BitVector())
3024 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3025 else if (RegVT == MVT::x86mmx)
3026 RC = &X86::VR64RegClass;
3027 else if (RegVT == MVT::i1)
3028 RC = &X86::VK1RegClass;
3029 else if (RegVT == MVT::v8i1)
3030 RC = &X86::VK8RegClass;
3031 else if (RegVT == MVT::v16i1)
3032 RC = &X86::VK16RegClass;
3033 else if (RegVT == MVT::v32i1)
3034 RC = &X86::VK32RegClass;
3035 else if (RegVT == MVT::v64i1)
3036 RC = &X86::VK64RegClass;
3038 llvm_unreachable("Unknown argument type!");
3040 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3041 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3044 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3045 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3046 // right size.
3047 if (VA.getLocInfo() == CCValAssign::SExt)
3048 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3049 DAG.getValueType(VA.getValVT()));
3050 else if (VA.getLocInfo() == CCValAssign::ZExt)
3051 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3052 DAG.getValueType(VA.getValVT()));
3053 else if (VA.getLocInfo() == CCValAssign::BCvt)
3054 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3056 if (VA.isExtInLoc()) {
3057 // Handle MMX values passed in XMM regs.
3058 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3059 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3060 else if (VA.getValVT().isVector() &&
3061 VA.getValVT().getScalarType() == MVT::i1 &&
3062 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3063 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3064 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3065 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3067 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3070 assert(VA.isMemLoc());
3071 ArgValue =
3072 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3075 // If value is passed via pointer - do a load.
3076 if (VA.getLocInfo() == CCValAssign::Indirect)
3077 ArgValue =
3078 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3080 InVals.push_back(ArgValue);
3083 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3084 // The Swift calling convention does not require that we copy the sret
3085 // argument into %rax/%eax for the return, so we don't set SRetReturnReg for Swift.
3086 if (CallConv == CallingConv::Swift)
3089 // All x86 ABIs require that for returning structs by value we copy the
3090 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3091 // the argument into a virtual register so that we can access it from the
3092 // return points.
3093 if (Ins[I].Flags.isSRet()) {
3094 unsigned Reg = FuncInfo->getSRetReturnReg();
3095 if (!Reg) {
3096 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3097 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3098 FuncInfo->setSRetReturnReg(Reg);
3100 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3101 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3106 unsigned StackSize = CCInfo.getNextStackOffset();
3107 // Align stack specially for tail calls.
3108 if (shouldGuaranteeTCO(CallConv,
3109 MF.getTarget().Options.GuaranteedTailCallOpt))
3110 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3112 // If the function takes a variable number of arguments, make a frame index for
3113 // the start of the first vararg value... for expansion of llvm.va_start. We
3114 // can skip this if there are no va_start calls.
3115 if (MFI.hasVAStart() &&
3116 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3117 CallConv != CallingConv::X86_ThisCall))) {
3118 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3121 // Figure out if XMM registers are in use.
3122 assert(!(Subtarget.useSoftFloat() &&
3123 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3124 "SSE register cannot be used when SSE is disabled!");
3126 // 64-bit calling conventions support varargs and register parameters, so we
3127 // have to do extra work to spill them in the prologue.
3128 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3129 // Find the first unallocated argument registers.
3130 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3131 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3132 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3133 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3134 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3135 "SSE register cannot be used when SSE is disabled!");
3137 // Gather all the live in physical registers.
3138 SmallVector<SDValue, 6> LiveGPRs;
3139 SmallVector<SDValue, 8> LiveXMMRegs;
3141 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3142 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3144 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3146 if (!ArgXMMs.empty()) {
3147 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3148 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3149 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3150 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3151 LiveXMMRegs.push_back(
3152 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3157 // Get to the caller-allocated home save location. Add 8 to account
3158 // for the return address.
3159 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3160 FuncInfo->setRegSaveFrameIndex(
3161 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
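// Note: on Win64 the caller always reserves a 32-byte "home" (shadow) area for
// RCX, RDX, R8 and R9 directly above the return address, so vararg registers
// can be spilled into the caller's frame instead of a fresh stack object.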
3162 // Fixup to set vararg frame on shadow area (4 x i64).
3164 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3166 // For X86-64, if there are vararg parameters that are passed via
3167 // registers, then we must store them to their spots on the stack so
3168 // they may be loaded by dereferencing the result of va_next.
3169 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3170 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3171 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3172 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
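// The save area created here matches the System V AMD64 va_list layout:
// 6 GPR slots (48 bytes) followed by 8 XMM slots (128 bytes), 176 bytes total.
// The VarArgsGPOffset/VarArgsFPOffset values set just above are the initial
// gp_offset/fp_offset that va_arg uses to index into this area.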
3175 // Store the integer parameter registers.
3176 SmallVector<SDValue, 8> MemOps;
3177 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3178 getPointerTy(DAG.getDataLayout()));
3179 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3180 for (SDValue Val : LiveGPRs) {
3181 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3182 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3184 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3185 MachinePointerInfo::getFixedStack(
3186 DAG.getMachineFunction(),
3187 FuncInfo->getRegSaveFrameIndex(), Offset));
3188 MemOps.push_back(Store);
3192 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3193 // Now store the XMM (fp + vector) parameter registers.
3194 SmallVector<SDValue, 12> SaveXMMOps;
3195 SaveXMMOps.push_back(Chain);
3196 SaveXMMOps.push_back(ALVal);
3197 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3198 FuncInfo->getRegSaveFrameIndex(), dl));
3199 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3200 FuncInfo->getVarArgsFPOffset(), dl));
3201 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3203 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3204 MVT::Other, SaveXMMOps));
3207 if (!MemOps.empty())
3208 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3211 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3212 // Find the largest legal vector type.
3213 MVT VecVT = MVT::Other;
3214 // FIXME: Only some x86_32 calling conventions support AVX512.
3215 if (Subtarget.hasAVX512() &&
3216 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3217 CallConv == CallingConv::Intel_OCL_BI)))
3218 VecVT = MVT::v16f32;
3219 else if (Subtarget.hasAVX())
3220 VecVT = MVT::v8f32;
3221 else if (Subtarget.hasSSE2())
3222 VecVT = MVT::v4f32;
3224 // We forward some GPRs and some vector types.
3225 SmallVector<MVT, 2> RegParmTypes;
3226 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3227 RegParmTypes.push_back(IntVT);
3228 if (VecVT != MVT::Other)
3229 RegParmTypes.push_back(VecVT);
3231 // Compute the set of forwarded registers. The rest are scratch.
3232 SmallVectorImpl<ForwardedRegister> &Forwards =
3233 FuncInfo->getForwardedMustTailRegParms();
3234 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3236 // Conservatively forward AL on x86_64, since it might be used for varargs.
3237 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3238 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3239 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3242 // Copy all forwards from physical to virtual registers.
3243 for (ForwardedRegister &F : Forwards) {
3244 // FIXME: Can we use a less constrained schedule?
3245 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3246 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3247 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3251 // Some CCs need callee pop.
3252 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3253 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3254 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3255 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3256 // X86 interrupts must pop the error code (and the alignment padding) if
3257 // present.
3258 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3259 } else {
3260 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3261 // If this is an sret function, the return should pop the hidden pointer.
3262 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3263 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3264 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3265 FuncInfo->setBytesToPopOnReturn(4);
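// For example, on 32-bit Linux a function returning a struct by value is
// emitted with a "ret $4" so that the callee pops the hidden sret pointer.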
3269 // RegSaveFrameIndex is X86-64 only.
3270 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3271 if (CallConv == CallingConv::X86_FastCall ||
3272 CallConv == CallingConv::X86_ThisCall)
3273 // fastcc functions can't have varargs.
3274 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3277 FuncInfo->setArgumentStackSize(StackSize);
3279 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3280 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3281 if (Personality == EHPersonality::CoreCLR) {
3283 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3284 // that we'd prefer this slot be allocated towards the bottom of the frame
3285 // (i.e. near the stack pointer after allocating the frame). Every
3286 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3287 // offset from the bottom of this and each funclet's frame must be the
3288 // same, so the size of funclets' (mostly empty) frames is dictated by
3289 // how far this slot is from the bottom (since they allocate just enough
3290 // space to accommodate holding this slot at the correct offset).
3291 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3292 EHInfo->PSPSymFrameIdx = PSPSymFI;
3296 if (CallConv == CallingConv::X86_RegCall) {
3297 const MachineRegisterInfo &MRI = MF.getRegInfo();
3298 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3299 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3305 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3306 SDValue Arg, const SDLoc &dl,
3308 const CCValAssign &VA,
3309 ISD::ArgFlagsTy Flags) const {
3310 unsigned LocMemOffset = VA.getLocMemOffset();
3311 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3312 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3314 if (Flags.isByVal())
3315 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3317 return DAG.getStore(
3318 Chain, dl, Arg, PtrOff,
3319 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3322 /// Emit a load of return address if tail call
3323 /// optimization is performed and it is required.
3324 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3325 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3326 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3327 // Adjust the Return address stack slot.
3328 EVT VT = getPointerTy(DAG.getDataLayout());
3329 OutRetAddr = getReturnAddressFrameIndex(DAG);
3331 // Load the "old" Return address.
3332 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3333 return SDValue(OutRetAddr.getNode(), 1);
3336 /// Emit a store of the return address if tail call
3337 /// optimization is performed and it is required (FPDiff!=0).
3338 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3339 SDValue Chain, SDValue RetAddrFrIdx,
3340 EVT PtrVT, unsigned SlotSize,
3341 int FPDiff, const SDLoc &dl) {
3342 // Store the return address to the appropriate stack slot.
3343 if (!FPDiff) return Chain;
3344 // Calculate the new stack slot for the return address.
3345 int NewReturnAddrFI =
3346 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3348 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3349 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3350 MachinePointerInfo::getFixedStack(
3351 DAG.getMachineFunction(), NewReturnAddrFI));
3355 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3356 /// operation of the specified width.
3357 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3359 unsigned NumElems = VT.getVectorNumElements();
3360 SmallVector<int, 8> Mask;
3361 Mask.push_back(NumElems);
3362 for (unsigned i = 1; i != NumElems; ++i)
3364 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
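// For example, with VT = v2i64 this builds the mask <2, 1>: lane 0 comes from
// V2 and lane 1 from V1, i.e. the movsd-style "replace the low element"
// pattern used below when passing MMX values in XMM registers.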
3367 SDValue
3368 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3369 SmallVectorImpl<SDValue> &InVals) const {
3370 SelectionDAG &DAG = CLI.DAG;
3372 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3373 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3374 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3375 SDValue Chain = CLI.Chain;
3376 SDValue Callee = CLI.Callee;
3377 CallingConv::ID CallConv = CLI.CallConv;
3378 bool &isTailCall = CLI.IsTailCall;
3379 bool isVarArg = CLI.IsVarArg;
3381 MachineFunction &MF = DAG.getMachineFunction();
3382 bool Is64Bit = Subtarget.is64Bit();
3383 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3384 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3385 bool IsSibcall = false;
3386 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3387 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3389 if (CallConv == CallingConv::X86_INTR)
3390 report_fatal_error("X86 interrupts may not be called directly");
3392 if (Attr.getValueAsString() == "true")
3393 isTailCall = false;
3395 if (Subtarget.isPICStyleGOT() &&
3396 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3397 // If we are using a GOT, disable tail calls to external symbols with
3398 // default visibility. Tail calling such a symbol requires using a GOT
3399 // relocation, which forces early binding of the symbol. This breaks code
3400 // that requires lazy function symbol resolution. Using musttail or
3401 // GuaranteedTailCallOpt will override this.
3402 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3403 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3404 G->getGlobal()->hasDefaultVisibility()))
3408 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3409 if (IsMustTail) {
3410 // Force this to be a tail call. The verifier rules are enough to ensure
3411 // that we can lower this successfully without moving the return address
3412 // around.
3413 isTailCall = true;
3414 } else if (isTailCall) {
3415 // Check if it's really possible to do a tail call.
3416 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3417 isVarArg, SR != NotStructReturn,
3418 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3419 Outs, OutVals, Ins, DAG);
3421 // Sibcalls are automatically detected tailcalls which do not require
3422 // ABI changes.
3423 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3424 IsSibcall = true;
3430 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3431 "Var args not supported with calling convention fastcc, ghc or hipe");
3433 // Analyze operands of the call, assigning locations to each operand.
3434 SmallVector<CCValAssign, 16> ArgLocs;
3435 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3437 // Allocate shadow area for Win64.
3438 if (IsWin64)
3439 CCInfo.AllocateStack(32, 8);
3441 CCInfo.AnalyzeArguments(Outs, CC_X86);
3443 // In the vectorcall calling convention a second pass is required for the HVA
3444 // registers.
3445 if (CallingConv::X86_VectorCall == CallConv) {
3446 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3449 // Get a count of how many bytes are to be pushed on the stack.
3450 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3451 if (IsSibcall)
3452 // This is a sibcall. The memory operands are already available in the
3453 // caller's incoming argument stack area.
3454 NumBytes = 0;
3455 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3456 canGuaranteeTCO(CallConv))
3457 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3459 int FPDiff = 0;
3460 if (isTailCall && !IsSibcall && !IsMustTail) {
3461 // Lower arguments at fp - stackoffset + fpdiff.
3462 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3464 FPDiff = NumBytesCallerPushed - NumBytes;
3466 // Set the delta of movement of the returnaddr stackslot.
3467 // But only set if delta is greater than previous delta.
3468 if (FPDiff < X86Info->getTCReturnAddrDelta())
3469 X86Info->setTCReturnAddrDelta(FPDiff);
3472 unsigned NumBytesToPush = NumBytes;
3473 unsigned NumBytesToPop = NumBytes;
3475 // If we have an inalloca argument, all stack space has already been allocated
3476 // for us and will be right at the top of the stack. We don't support multiple
3477 // arguments passed in memory when using inalloca.
3478 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3479 NumBytesToPush = 0;
3480 if (!ArgLocs.back().isMemLoc())
3481 report_fatal_error("cannot use inalloca attribute on a register "
3482 "parameter");
3483 if (ArgLocs.back().getLocMemOffset() != 0)
3484 report_fatal_error("any parameter with the inalloca attribute must be "
3485 "the only memory argument");
3489 Chain = DAG.getCALLSEQ_START(
3490 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3492 SDValue RetAddrFrIdx;
3493 // Load return address for tail calls.
3494 if (isTailCall && FPDiff)
3495 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3496 Is64Bit, FPDiff, dl);
3498 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3499 SmallVector<SDValue, 8> MemOpChains;
3502 // The next loop assumes that the locations are in the same order as the
3503 // Outs array.
3504 assert(isSortedByValueNo(ArgLocs) &&
3505 "Argument Location list must be sorted before lowering");
3507 // Walk the register/memloc assignments, inserting copies/loads. In the case
3508 // of tail call optimization, arguments are handled later.
3509 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3510 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3512 assert(OutIndex < Outs.size() && "Invalid Out index");
3513 // Skip inalloca arguments, they have already been written.
3514 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3515 if (Flags.isInAlloca())
3518 CCValAssign &VA = ArgLocs[I];
3519 EVT RegVT = VA.getLocVT();
3520 SDValue Arg = OutVals[OutIndex];
3521 bool isByVal = Flags.isByVal();
3523 // Promote the value if needed.
3524 switch (VA.getLocInfo()) {
3525 default: llvm_unreachable("Unknown loc info!");
3526 case CCValAssign::Full: break;
3527 case CCValAssign::SExt:
3528 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3530 case CCValAssign::ZExt:
3531 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3533 case CCValAssign::AExt:
3534 if (Arg.getValueType().isVector() &&
3535 Arg.getValueType().getVectorElementType() == MVT::i1)
3536 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3537 else if (RegVT.is128BitVector()) {
3538 // Special case: passing MMX values in XMM registers.
3539 Arg = DAG.getBitcast(MVT::i64, Arg);
3540 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3541 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3543 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3545 case CCValAssign::BCvt:
3546 Arg = DAG.getBitcast(RegVT, Arg);
3548 case CCValAssign::Indirect: {
3549 // Store the argument.
3550 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3551 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3552 Chain = DAG.getStore(
3553 Chain, dl, Arg, SpillSlot,
3554 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3560 if (VA.needsCustom()) {
3561 assert(VA.getValVT() == MVT::v64i1 &&
3562 "Currently the only custom case is when we split v64i1 to 2 regs");
3563 // Split v64i1 value into two registers
3564 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3566 } else if (VA.isRegLoc()) {
3567 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3568 if (isVarArg && IsWin64) {
3569 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3570 // shadow reg if callee is a varargs function.
3571 unsigned ShadowReg = 0;
3572 switch (VA.getLocReg()) {
3573 case X86::XMM0: ShadowReg = X86::RCX; break;
3574 case X86::XMM1: ShadowReg = X86::RDX; break;
3575 case X86::XMM2: ShadowReg = X86::R8; break;
3576 case X86::XMM3: ShadowReg = X86::R9; break;
3579 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3581 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3582 assert(VA.isMemLoc());
3583 if (!StackPtr.getNode())
3584 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3585 getPointerTy(DAG.getDataLayout()));
3586 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3587 dl, DAG, VA, Flags));
3591 if (!MemOpChains.empty())
3592 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3594 if (Subtarget.isPICStyleGOT()) {
3595 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3598 RegsToPass.push_back(std::make_pair(
3599 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3600 getPointerTy(DAG.getDataLayout()))));
3602 // If we are tail calling and generating PIC/GOT style code load the
3603 // address of the callee into ECX. The value in ecx is used as target of
3604 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3605 // for tail calls on PIC/GOT architectures. Normally we would just put the
3606 // address of GOT into ebx and then call target@PLT. But for tail calls
3607 // ebx would be restored (since ebx is callee saved) before jumping to the
3608 // target.
3610 // Note: The actual moving to ECX is done further down.
3611 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3612 if (G && !G->getGlobal()->hasLocalLinkage() &&
3613 G->getGlobal()->hasDefaultVisibility())
3614 Callee = LowerGlobalAddress(Callee, DAG);
3615 else if (isa<ExternalSymbolSDNode>(Callee))
3616 Callee = LowerExternalSymbol(Callee, DAG);
3620 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3621 // From AMD64 ABI document:
3622 // For calls that may call functions that use varargs or stdargs
3623 // (prototype-less calls or calls to functions containing ellipsis (...) in
3624 // the declaration) %al is used as hidden argument to specify the number
3625 // of SSE registers used. The contents of %al do not need to match exactly
3626 // the number of registers, but must be an upper bound on the number of SSE
3627 // registers used and is in the range 0 - 8 inclusive.
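// For example, a call such as printf("%f", x) that passes one double in XMM0
// sets AL to 1 before the call; any value between the actual count and 8 would
// also be legal.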
3629 // Count the number of XMM registers allocated.
3630 static const MCPhysReg XMMArgRegs[] = {
3631 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3632 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3634 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3635 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3636 && "SSE registers cannot be used when SSE is disabled");
3638 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3639 DAG.getConstant(NumXMMRegs, dl,
3643 if (isVarArg && IsMustTail) {
3644 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3645 for (const auto &F : Forwards) {
3646 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3647 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3651 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3652 // don't need this because the eligibility check rejects calls that require
3653 // shuffling arguments passed in memory.
3654 if (!IsSibcall && isTailCall) {
3655 // Force all the incoming stack arguments to be loaded from the stack
3656 // before any new outgoing arguments are stored to the stack, because the
3657 // outgoing stack slots may alias the incoming argument stack slots, and
3658 // the alias isn't otherwise explicit. This is slightly more conservative
3659 // than necessary, because it means that each store effectively depends
3660 // on every argument instead of just those arguments it would clobber.
3661 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3663 SmallVector<SDValue, 8> MemOpChains2;
3666 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3668 CCValAssign &VA = ArgLocs[I];
3670 if (VA.isRegLoc()) {
3671 if (VA.needsCustom()) {
3672 assert((CallConv == CallingConv::X86_RegCall) &&
3673 "Expecting custom case only in regcall calling convention");
3674 // This means that we are in a special case where one argument was
3675 // passed through two register locations; skip the next location.
3682 assert(VA.isMemLoc());
3683 SDValue Arg = OutVals[OutsIndex];
3684 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3685 // Skip inalloca arguments. They don't require any work.
3686 if (Flags.isInAlloca())
3688 // Create frame index.
3689 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3690 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3691 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3692 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3694 if (Flags.isByVal()) {
3695 // Copy relative to framepointer.
3696 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3697 if (!StackPtr.getNode())
3698 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3699 getPointerTy(DAG.getDataLayout()));
3700 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3703 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3707 // Store relative to framepointer.
3708 MemOpChains2.push_back(DAG.getStore(
3709 ArgChain, dl, Arg, FIN,
3710 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3714 if (!MemOpChains2.empty())
3715 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3717 // Store the return address to the appropriate stack slot.
3718 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3719 getPointerTy(DAG.getDataLayout()),
3720 RegInfo->getSlotSize(), FPDiff, dl);
3723 // Build a sequence of copy-to-reg nodes chained together with token chain
3724 // and flag operands which copy the outgoing args into registers.
3726 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3727 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3728 RegsToPass[i].second, InFlag);
3729 InFlag = Chain.getValue(1);
3732 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3733 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3734 // In the 64-bit large code model, we have to make all calls
3735 // through a register, since the call instruction's 32-bit
3736 // pc-relative offset may not be large enough to hold the whole
3737 // address.
3738 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3739 // If the callee is a GlobalAddress node (quite common, every direct call
3740 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3741 // it.
3742 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3744 // We should use an extra load for direct calls to dllimported functions in
3745 // non-JIT mode.
3746 const GlobalValue *GV = G->getGlobal();
3747 if (!GV->hasDLLImportStorageClass()) {
3748 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3750 Callee = DAG.getTargetGlobalAddress(
3751 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3753 if (OpFlags == X86II::MO_GOTPCREL) {
3755 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3756 getPointerTy(DAG.getDataLayout()), Callee);
3757 // Add extra indirection
3758 Callee = DAG.getLoad(
3759 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3760 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3763 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3764 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3765 unsigned char OpFlags =
3766 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3768 Callee = DAG.getTargetExternalSymbol(
3769 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3770 } else if (Subtarget.isTarget64BitILP32() &&
3771 Callee->getValueType(0) == MVT::i32) {
3772 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3773 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3776 // Returns a chain & a flag for retval copy to use.
3777 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3778 SmallVector<SDValue, 8> Ops;
3780 if (!IsSibcall && isTailCall) {
3781 Chain = DAG.getCALLSEQ_END(Chain,
3782 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3783 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3784 InFlag = Chain.getValue(1);
3787 Ops.push_back(Chain);
3788 Ops.push_back(Callee);
3791 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3793 // Add argument registers to the end of the list so that they are known live
3795 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3796 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3797 RegsToPass[i].second.getValueType()));
3799 // Add a register mask operand representing the call-preserved registers.
3800 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3801 assert(Mask && "Missing call preserved mask for calling convention");
3803 // If this is an invoke in a 32-bit function using a funclet-based
3804 // personality, assume the function clobbers all registers. If an exception
3805 // is thrown, the runtime will not restore CSRs.
3806 // FIXME: Model this more precisely so that we can register allocate across
3807 // the normal edge and spill and fill across the exceptional edge.
3808 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3809 const Function *CallerFn = MF.getFunction();
3810 EHPersonality Pers =
3811 CallerFn->hasPersonalityFn()
3812 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3813 : EHPersonality::Unknown;
3814 if (isFuncletEHPersonality(Pers))
3815 Mask = RegInfo->getNoPreservedMask();
3818 // Define a new register mask from the existing mask.
3819 uint32_t *RegMask = nullptr;
3821 // In some calling conventions we need to remove the used physical registers
3822 // from the reg mask.
3823 if (CallConv == CallingConv::X86_RegCall) {
3824 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3826 // Allocate a new Reg Mask and copy Mask.
3827 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3828 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3829 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3831 // Make sure all sub registers of the argument registers are reset
3832 // in the RegMask.
3833 for (auto const &RegPair : RegsToPass)
3834 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3835 SubRegs.isValid(); ++SubRegs)
3836 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
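// In a register mask operand a set bit means "preserved across the call", so
// clearing the bits of the argument registers (and all their sub-registers)
// marks them as clobbered, and the register allocator will not keep values
// live in them across this call.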
3838 // Create the RegMask Operand according to our updated mask.
3839 Ops.push_back(DAG.getRegisterMask(RegMask));
3841 // Create the RegMask Operand according to the static mask.
3842 Ops.push_back(DAG.getRegisterMask(Mask));
3845 if (InFlag.getNode())
3846 Ops.push_back(InFlag);
3850 //// If this is the first return lowered for this function, add the regs
3851 //// to the liveout set for the function.
3852 // This isn't right, although it's probably harmless on x86; liveouts
3853 // should be computed from returns not tail calls. Consider a void
3854 // function making a tail call to a function returning int.
3855 MF.getFrameInfo().setHasTailCall();
3856 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3859 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3860 InFlag = Chain.getValue(1);
3862 // Create the CALLSEQ_END node.
3863 unsigned NumBytesForCalleeToPop;
3864 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3865 DAG.getTarget().Options.GuaranteedTailCallOpt))
3866 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3867 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3868 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3869 SR == StackStructReturn)
3870 // If this is a call to a struct-return function, the callee
3871 // pops the hidden struct pointer, so we have to push it back.
3872 // This is common for Darwin/X86, Linux & Mingw32 targets.
3873 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3874 NumBytesForCalleeToPop = 4;
3876 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3878 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3879 // No need to reset the stack after the call if the call doesn't return. To
3880 // make the MI verify, we'll pretend the callee does it for us.
3881 NumBytesForCalleeToPop = NumBytes;
3884 // Returns a flag for retval copy to use.
3886 Chain = DAG.getCALLSEQ_END(Chain,
3887 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3888 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3891 InFlag = Chain.getValue(1);
3894 // Handle result values, copying them out of physregs into vregs that we
3896 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3900 //===----------------------------------------------------------------------===//
3901 // Fast Calling Convention (tail call) implementation
3902 //===----------------------------------------------------------------------===//
3904 // Like stdcall, the callee cleans up the arguments, except that ECX is
3905 // reserved for storing the tail-called function's address. Only 2 registers
3906 // are free for argument passing (inreg). Tail call optimization is performed
3907 // provided:
3908 // * tailcallopt is enabled
3909 // * caller/callee are fastcc
3910 // On X86_64 architecture with GOT-style position independent code only local
3911 // (within module) calls are supported at the moment.
3912 // To keep the stack aligned according to the platform ABI, the function
3913 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3914 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3915 // If a tail called function callee has more arguments than the caller the
3916 // caller needs to make sure that there is room to move the RETADDR to. This is
3917 // achieved by reserving an area the size of the argument delta right after the
3918 // original RETADDR, but before the saved framepointer or the spilled registers
3919 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3931 /// Round the stack size up so that it is aligned for the platform, e.g. to
3932 /// 16n + 12 for a 16-byte alignment requirement with a 4-byte return-address slot.
3933 unsigned
3934 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3935 SelectionDAG& DAG) const {
3936 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3937 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3938 unsigned StackAlignment = TFI.getStackAlignment();
3939 uint64_t AlignMask = StackAlignment - 1;
3940 int64_t Offset = StackSize;
3941 unsigned SlotSize = RegInfo->getSlotSize();
3942 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3943 // Number smaller than 12 so just add the difference.
3944 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3946 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3947 Offset = ((~AlignMask) & Offset) + StackAlignment +
3948 (StackAlignment-SlotSize);
3949 }
3950 return Offset;
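// Worked example for GetAlignedArgumentStackSize with StackAlignment = 16 and
// SlotSize = 4: a raw size of 20 has (20 & 15) = 4 <= 12 and is bumped to 28
// (16n + 12); a raw size of 30 has (30 & 15) = 14 > 12 and becomes
// (30 & ~15) + 16 + 12 = 44. Either way the stack is 16-byte aligned again
// once the return address is pushed.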
3953 /// Return true if the given stack call argument is already available in the
3954 /// same position (relatively) of the caller's incoming argument stack.
3956 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3957 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3958 const X86InstrInfo *TII, const CCValAssign &VA) {
3959 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3962 // Look through nodes that don't alter the bits of the incoming value.
3963 unsigned Op = Arg.getOpcode();
3964 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3965 Arg = Arg.getOperand(0);
3968 if (Op == ISD::TRUNCATE) {
3969 const SDValue &TruncInput = Arg.getOperand(0);
3970 if (TruncInput.getOpcode() == ISD::AssertZext &&
3971 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3972 Arg.getValueType()) {
3973 Arg = TruncInput.getOperand(0);
3981 if (Arg.getOpcode() == ISD::CopyFromReg) {
3982 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3983 if (!TargetRegisterInfo::isVirtualRegister(VR))
3985 MachineInstr *Def = MRI->getVRegDef(VR);
3988 if (!Flags.isByVal()) {
3989 if (!TII->isLoadFromStackSlot(*Def, FI))
3992 unsigned Opcode = Def->getOpcode();
3993 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3994 Opcode == X86::LEA64_32r) &&
3995 Def->getOperand(1).isFI()) {
3996 FI = Def->getOperand(1).getIndex();
3997 Bytes = Flags.getByValSize();
4001 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4002 if (Flags.isByVal())
4003 // ByVal argument is passed in as a pointer but it's now being
4004 // dereferenced. e.g.
4005 // define @foo(%struct.X* %A) {
4006 // tail call @bar(%struct.X* byval %A)
4007 // }
4008 return false;
4009 SDValue Ptr = Ld->getBasePtr();
4010 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4013 FI = FINode->getIndex();
4014 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4015 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4016 FI = FINode->getIndex();
4017 Bytes = Flags.getByValSize();
4021 assert(FI != INT_MAX);
4022 if (!MFI.isFixedObjectIndex(FI))
4025 if (Offset != MFI.getObjectOffset(FI))
4028 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4029 // If the argument location is wider than the argument type, check that any
4030 // extension flags match.
4031 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4032 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4037 return Bytes == MFI.getObjectSize(FI);
4040 /// Check whether the call is eligible for tail call optimization. Targets
4041 /// that want to do tail call optimization should implement this function.
4042 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4043 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4044 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4045 const SmallVectorImpl<ISD::OutputArg> &Outs,
4046 const SmallVectorImpl<SDValue> &OutVals,
4047 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4048 if (!mayTailCallThisCC(CalleeCC))
4051 // If -tailcallopt is specified, make fastcc functions tail-callable.
4052 MachineFunction &MF = DAG.getMachineFunction();
4053 const Function *CallerF = MF.getFunction();
4055 // If the function return type is x86_fp80 and the callee return type is not,
4056 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4057 // perform a tailcall optimization here.
4058 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4061 CallingConv::ID CallerCC = CallerF->getCallingConv();
4062 bool CCMatch = CallerCC == CalleeCC;
4063 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4064 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4066 // Win64 functions have extra shadow space for argument homing. Don't do the
4067 // sibcall if the caller and callee have mismatched expectations for this
4068 // space.
4069 if (IsCalleeWin64 != IsCallerWin64)
4072 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4073 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4074 return true;
4075 return false;
4076 }
4078 // Look for obvious safe cases to perform tail call optimization that do not
4079 // require ABI changes. This is what gcc calls sibcall.
4081 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4082 // emit a special epilogue.
4083 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4084 if (RegInfo->needsStackRealignment(MF))
4087 // Also avoid sibcall optimization if either caller or callee uses struct
4088 // return semantics.
4089 if (isCalleeStructRet || isCallerStructRet)
4092 // Do not sibcall optimize vararg calls unless all arguments are passed via
4093 // registers.
4094 LLVMContext &C = *DAG.getContext();
4095 if (isVarArg && !Outs.empty()) {
4096 // Optimizing for varargs on Win64 is unlikely to be safe without
4097 // additional testing.
4098 if (IsCalleeWin64 || IsCallerWin64)
4101 SmallVector<CCValAssign, 16> ArgLocs;
4102 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4104 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4105 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4106 if (!ArgLocs[i].isRegLoc())
4110 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4111 // stack. Therefore, if it's not used by the call it is not safe to optimize
4112 // this into a sibcall.
4113 bool Unused = false;
4114 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4121 SmallVector<CCValAssign, 16> RVLocs;
4122 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4123 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4124 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4125 CCValAssign &VA = RVLocs[i];
4126 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4131 // Check that the call results are passed in the same way.
4132 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4133 RetCC_X86, RetCC_X86))
4135 // The callee has to preserve all registers the caller needs to preserve.
4136 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4137 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4139 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4140 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4144 unsigned StackArgsSize = 0;
4146 // If the callee takes no arguments then go on to check the results of the
4147 // call.
4148 if (!Outs.empty()) {
4149 // Check if stack adjustment is needed. For now, do not do this if any
4150 // argument is passed on the stack.
4151 SmallVector<CCValAssign, 16> ArgLocs;
4152 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4154 // Allocate shadow area for Win64
4156 CCInfo.AllocateStack(32, 8);
4158 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4159 StackArgsSize = CCInfo.getNextStackOffset();
4161 if (CCInfo.getNextStackOffset()) {
4162 // Check if the arguments are already laid out in the same way as the
4163 // caller's fixed stack objects.
4164 MachineFrameInfo &MFI = MF.getFrameInfo();
4165 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4166 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4167 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4168 CCValAssign &VA = ArgLocs[i];
4169 SDValue Arg = OutVals[i];
4170 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4171 if (VA.getLocInfo() == CCValAssign::Indirect)
4173 if (!VA.isRegLoc()) {
4174 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4181 bool PositionIndependent = isPositionIndependent();
4182 // If the tailcall address may be in a register, then make sure it's
4183 // possible to register allocate for it. In 32-bit, the call address can
4184 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4185 // callee-saved registers are restored. These happen to be the same
4186 // registers used to pass 'inreg' arguments so watch out for those.
4187 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4188 !isa<ExternalSymbolSDNode>(Callee)) ||
4189 PositionIndependent)) {
4190 unsigned NumInRegs = 0;
4191 // In PIC we need an extra register to formulate the address computation
4192 // for the callee.
4193 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4195 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4196 CCValAssign &VA = ArgLocs[i];
4199 unsigned Reg = VA.getLocReg();
4202 case X86::EAX: case X86::EDX: case X86::ECX:
4203 if (++NumInRegs == MaxInRegs)
4210 const MachineRegisterInfo &MRI = MF.getRegInfo();
4211 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4215 bool CalleeWillPop =
4216 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4217 MF.getTarget().Options.GuaranteedTailCallOpt);
4219 if (unsigned BytesToPop =
4220 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4221 // If we have bytes to pop, the callee must pop them.
4222 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4223 if (!CalleePopMatches)
4225 } else if (CalleeWillPop && StackArgsSize > 0) {
4226 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4234 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4235 const TargetLibraryInfo *libInfo) const {
4236 return X86::createFastISel(funcInfo, libInfo);
4239 //===----------------------------------------------------------------------===//
4240 // Other Lowering Hooks
4241 //===----------------------------------------------------------------------===//
4243 static bool MayFoldLoad(SDValue Op) {
4244 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4247 static bool MayFoldIntoStore(SDValue Op) {
4248 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4251 static bool MayFoldIntoZeroExtend(SDValue Op) {
4252 if (Op.hasOneUse()) {
4253 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4254 return (ISD::ZERO_EXTEND == Opcode);
4259 static bool isTargetShuffle(unsigned Opcode) {
4260 switch (Opcode) {
4261 default: return false;
4262 case X86ISD::BLENDI:
4263 case X86ISD::PSHUFB:
4264 case X86ISD::PSHUFD:
4265 case X86ISD::PSHUFHW:
4266 case X86ISD::PSHUFLW:
4268 case X86ISD::INSERTPS:
4269 case X86ISD::PALIGNR:
4270 case X86ISD::VSHLDQ:
4271 case X86ISD::VSRLDQ:
4272 case X86ISD::MOVLHPS:
4273 case X86ISD::MOVLHPD:
4274 case X86ISD::MOVHLPS:
4275 case X86ISD::MOVLPS:
4276 case X86ISD::MOVLPD:
4277 case X86ISD::MOVSHDUP:
4278 case X86ISD::MOVSLDUP:
4279 case X86ISD::MOVDDUP:
4282 case X86ISD::UNPCKL:
4283 case X86ISD::UNPCKH:
4284 case X86ISD::VBROADCAST:
4285 case X86ISD::VPERMILPI:
4286 case X86ISD::VPERMILPV:
4287 case X86ISD::VPERM2X128:
4288 case X86ISD::VPERMIL2:
4289 case X86ISD::VPERMI:
4290 case X86ISD::VPPERM:
4291 case X86ISD::VPERMV:
4292 case X86ISD::VPERMV3:
4293 case X86ISD::VPERMIV3:
4294 case X86ISD::VZEXT_MOVL:
4299 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4300 switch (Opcode) {
4301 default: return false;
4303 case X86ISD::PSHUFB:
4304 case X86ISD::VPERMILPV:
4305 case X86ISD::VPERMIL2:
4306 case X86ISD::VPPERM:
4307 case X86ISD::VPERMV:
4308 case X86ISD::VPERMV3:
4309 case X86ISD::VPERMIV3:
4311 // 'Faux' Target Shuffles.
4318 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4319 MachineFunction &MF = DAG.getMachineFunction();
4320 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4321 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4322 int ReturnAddrIndex = FuncInfo->getRAIndex();
4324 if (ReturnAddrIndex == 0) {
4325 // Set up a frame object for the return address.
4326 unsigned SlotSize = RegInfo->getSlotSize();
4327 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4330 FuncInfo->setRAIndex(ReturnAddrIndex);
4333 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
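/// Return true if the given offset can be used as a displacement under the
/// given code model, optionally in the presence of a symbolic displacement.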
4336 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4337 bool hasSymbolicDisplacement) {
4338 // Offset should fit into a 32-bit immediate field.
4339 if (!isInt<32>(Offset))
4342 // If we don't have a symbolic displacement, we don't have any extra restrictions.
4344 if (!hasSymbolicDisplacement)
4347 // FIXME: Some tweaks might be needed for the medium code model.
4348 if (M != CodeModel::Small && M != CodeModel::Kernel)
4351 // For the small code model we assume that the latest object is 16MB before the
4352 // end of the 31-bit boundary. We may also accept pretty large negative constants,
4353 // knowing that all objects are in the positive half of the address space.
4354 if (M == CodeModel::Small && Offset < 16*1024*1024)
4357 // For the kernel code model we know that all objects reside in the negative half
4358 // of the 32-bit address space. We may not accept negative offsets, since they may
4359 // be just off, but we may accept pretty large positive ones.
4360 if (M == CodeModel::Kernel && Offset >= 0)
4366 /// Determines whether the callee is required to pop its own arguments.
4367 /// Callee pop is necessary to support tail calls.
4368 bool X86::isCalleePop(CallingConv::ID CallingConv,
4369 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4370 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4371 // can guarantee TCO.
4372 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4375 switch (CallingConv) {
4378 case CallingConv::X86_StdCall:
4379 case CallingConv::X86_FastCall:
4380 case CallingConv::X86_ThisCall:
4381 case CallingConv::X86_VectorCall:
4386 /// \brief Return true if the condition is an unsigned comparison operation.
4387 static bool isX86CCUnsigned(unsigned X86CC) {
4390 llvm_unreachable("Invalid integer condition!");
4406 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4407 switch (SetCCOpcode) {
4408 default: llvm_unreachable("Invalid integer condition!");
4409 case ISD::SETEQ: return X86::COND_E;
4410 case ISD::SETGT: return X86::COND_G;
4411 case ISD::SETGE: return X86::COND_GE;
4412 case ISD::SETLT: return X86::COND_L;
4413 case ISD::SETLE: return X86::COND_LE;
4414 case ISD::SETNE: return X86::COND_NE;
4415 case ISD::SETULT: return X86::COND_B;
4416 case ISD::SETUGT: return X86::COND_A;
4417 case ISD::SETULE: return X86::COND_BE;
4418 case ISD::SETUGE: return X86::COND_AE;
4422 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4423 /// condition code, returning the condition code and the LHS/RHS of the
4424 /// comparison to make.
4425 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4426 bool isFP, SDValue &LHS, SDValue &RHS,
4427 SelectionDAG &DAG) {
4429 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4430 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4431 // X > -1 -> X == 0, jump !sign.
4432 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4433 return X86::COND_NS;
4435 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4436 // X < 0 -> X == 0, jump on sign.
4439 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
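// X < 1 -> X <= 0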
4441 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4442 return X86::COND_LE;
4446 return TranslateIntegerX86CC(SetCCOpcode);
4449 // First determine if it is required or is profitable to flip the operands.
4451 // If LHS is a foldable load, but RHS is not, flip the condition.
4452 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4453 !ISD::isNON_EXTLoad(RHS.getNode())) {
4454 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4455 std::swap(LHS, RHS);
4458 switch (SetCCOpcode) {
4464 std::swap(LHS, RHS);
4468 // On a floating point condition, the flags are set as follows:
4469 //  ZF | PF | CF | op
4470 //   0 |  0 |  0 | X > Y
4471 //   0 |  0 |  1 | X < Y
4472 //   1 |  0 |  0 | X == Y
4473 //   1 |  1 |  1 | unordered
4474 switch (SetCCOpcode) {
4475 default: llvm_unreachable("Condcode should be pre-legalized away");
4477 case ISD::SETEQ: return X86::COND_E;
4478 case ISD::SETOLT: // flipped
4480 case ISD::SETGT: return X86::COND_A;
4481 case ISD::SETOLE: // flipped
4483 case ISD::SETGE: return X86::COND_AE;
4484 case ISD::SETUGT: // flipped
4486 case ISD::SETLT: return X86::COND_B;
4487 case ISD::SETUGE: // flipped
4489 case ISD::SETLE: return X86::COND_BE;
4491 case ISD::SETNE: return X86::COND_NE;
4492 case ISD::SETUO: return X86::COND_P;
4493 case ISD::SETO: return X86::COND_NP;
4495 case ISD::SETUNE: return X86::COND_INVALID;
4499 /// Is there a floating point cmov for the specific X86 condition code?
4500 /// The current x86 ISA includes the following FP cmov instructions:
4501 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4502 static bool hasFPCMov(unsigned X86CC) {
4519 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4521 unsigned Intrinsic) const {
4523 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4527 Info.opc = ISD::INTRINSIC_W_CHAIN;
4528 Info.readMem = false;
4529 Info.writeMem = false;
4533 switch (IntrData->Type) {
4534 case EXPAND_FROM_MEM: {
4535 Info.ptrVal = I.getArgOperand(0);
4536 Info.memVT = MVT::getVT(I.getType());
4538 Info.readMem = true;
4541 case COMPRESS_TO_MEM: {
4542 Info.ptrVal = I.getArgOperand(0);
4543 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4545 Info.writeMem = true;
4548 case TRUNCATE_TO_MEM_VI8:
4549 case TRUNCATE_TO_MEM_VI16:
4550 case TRUNCATE_TO_MEM_VI32: {
4551 Info.ptrVal = I.getArgOperand(0);
4552 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4553 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4554 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4556 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4557 ScalarVT = MVT::i16;
4558 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4559 ScalarVT = MVT::i32;
4561 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4563 Info.writeMem = true;
4573 /// Returns true if the target can instruction select the
4574 /// specified FP immediate natively. If false, the legalizer will
4575 /// materialize the FP immediate as a load from a constant pool.
4576 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4577 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4578 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4584 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4585 ISD::LoadExtType ExtTy,
4587 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4588 // relocations must target a movq or addq instruction: don't let the load shrink.
4589 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4590 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4591 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4592 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4596 /// \brief Returns true if it is beneficial to convert a load of a constant
4597 /// to just the constant itself.
4598 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4600 assert(Ty->isIntegerTy());
4602 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4603 if (BitSize == 0 || BitSize > 64)
4608 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4609 unsigned Index) const {
4610 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4613 return (Index == 0 || Index == ResVT.getVectorNumElements());
4616 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4617 // Speculate cttz only if we can directly use TZCNT.
4618 return Subtarget.hasBMI();
4621 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4622 // Speculate ctlz only if we can directly use LZCNT.
4623 return Subtarget.hasLZCNT();
4626 bool X86TargetLowering::isCtlzFast() const {
4627 return Subtarget.hasFastLZCNT();
4630 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4631 const Instruction &AndI) const {
4635 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4636 if (!Subtarget.hasBMI())
4639 // There are only 32-bit and 64-bit forms for 'andn'.
4640 EVT VT = Y.getValueType();
4641 if (VT != MVT::i32 && VT != MVT::i64)
4647 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4648 MVT VT = MVT::getIntegerVT(NumBits);
4649 if (isTypeLegal(VT))
4652 // PMOVMSKB can handle this.
4653 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4656 // VPMOVMSKB can handle this.
4657 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4660 // TODO: Allow 64-bit type for 32-bit target.
4661 // TODO: 512-bit types should be allowed, but make sure that those
4662 // cases are handled in combineVectorSizedSetCCEquality().
4664 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4667 /// Val is the undef sentinel value or equal to the specified value.
4668 static bool isUndefOrEqual(int Val, int CmpVal) {
4669 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4672 /// Val is either the undef or zero sentinel value.
4673 static bool isUndefOrZero(int Val) {
4674 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4677 /// Return true if every element in Mask, beginning
4678 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4679 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4680 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4681 if (Mask[i] != SM_SentinelUndef)
4686 /// Return true if Val is undef or if its value falls within the
4687 /// specified range [Low, Hi).
4688 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4689 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4692 /// Return true if every element in Mask is undef or if its value
4693 /// falls within the specified range [Low, Hi).
4694 static bool isUndefOrInRange(ArrayRef<int> Mask,
4697 if (!isUndefOrInRange(M, Low, Hi))
4702 /// Return true if Val is undef, zero or if its value falls within the
4703 /// specified range [Low, Hi).
4704 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4705 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4708 /// Return true if every element in Mask is undef, zero or if its value
4709 /// falls within the specified range [Low, Hi).
4710 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4712 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4717 /// Return true if every element in Mask, beginning
4718 /// from position Pos and ending in Pos+Size, falls within the specified
4719 /// sequential range [Low, Low+Size), or is undef.
4720 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4721 unsigned Pos, unsigned Size, int Low) {
4722 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4723 if (!isUndefOrEqual(Mask[i], Low))
4728 /// Return true if every element in Mask, beginning
4729 /// from position Pos and ending in Pos+Size, falls within the specified
4730 /// sequential range [Low, Low+Size), or is undef or zero.
4731 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4732 unsigned Size, int Low) {
4733 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4734 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4739 /// Return true if every element in Mask, beginning
4740 /// from position Pos and ending in Pos+Size is undef or is zero.
4741 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4743 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4744 if (!isUndefOrZero(Mask[i]))
4749 /// \brief Helper function to test whether a shuffle mask could be
4750 /// simplified by widening the elements being shuffled.
4752 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4753 /// leaves it in an unspecified state.
4755 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4756 /// shuffle masks. The latter have the special property of a '-2' representing
4757 /// a zero-ed lane of a vector.
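/// For example, <0,1,2,3> widens to <0,1> and <-1,-1,2,3> widens to <-1,1>,
/// while <1,2,3,0> cannot be widened.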
4758 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4759 SmallVectorImpl<int> &WidenedMask) {
4760 WidenedMask.assign(Mask.size() / 2, 0);
4761 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4763 int M1 = Mask[i + 1];
4765 // If both elements are undef, it's trivial.
4766 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4767 WidenedMask[i / 2] = SM_SentinelUndef;
4771 // Check for an undef mask and a mask value properly aligned to fit with
4772 // a pair of values. If we find such a case, use the non-undef mask's value.
4773 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4774 WidenedMask[i / 2] = M1 / 2;
4777 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4778 WidenedMask[i / 2] = M0 / 2;
4782 // When zeroing, we need to spread the zeroing across both lanes to widen.
4783 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4784 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4785 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4786 WidenedMask[i / 2] = SM_SentinelZero;
4792 // Finally, check if the two mask values form an adjacent, properly aligned pair.
4794 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4795 WidenedMask[i / 2] = M0 / 2;
4799 // Otherwise we can't safely widen the elements used in this shuffle.
4802 assert(WidenedMask.size() == Mask.size() / 2 &&
4803 "Incorrect size of mask after widening the elements!");
4808 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4809 /// mask index with the scaled sequential indices for an equivalent narrowed
4810 /// mask. This is the reverse process to canWidenShuffleElements, but can always succeed.
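/// For example, scaling a mask of <1,2> by 2 produces <2,3,4,5>.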
4812 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4813 SmallVectorImpl<int> &ScaledMask) {
4814 assert(0 < Scale && "Unexpected scaling factor");
4815 int NumElts = Mask.size();
4816 ScaledMask.assign(NumElts * Scale, -1);
4818 for (int i = 0; i != NumElts; ++i) {
4821 // Repeat sentinel values in every mask element.
4823 for (int s = 0; s != Scale; ++s)
4824 ScaledMask[(Scale * i) + s] = M;
4828 // Scale mask element and increment across each mask element.
4829 for (int s = 0; s != Scale; ++s)
4830 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4834 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4835 /// extract that is suitable for instructions that extract 128 or 256-bit vectors.
4836 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4837 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4838 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4841 // The index should be aligned on a vecWidth-bit boundary.
4843 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4845 MVT VT = N->getSimpleValueType(0);
4846 unsigned ElSize = VT.getScalarSizeInBits();
4847 bool Result = (Index * ElSize) % vecWidth == 0;
4852 /// Return true if the specified INSERT_SUBVECTOR
4853 /// operand specifies a subvector insert that is suitable for the insertion
4854 /// of 128 or 256-bit subvectors.
4855 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4856 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4857 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4859 // The index should be aligned on a vecWidth-bit boundary.
4861 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4863 MVT VT = N->getSimpleValueType(0);
4864 unsigned ElSize = VT.getScalarSizeInBits();
4865 bool Result = (Index * ElSize) % vecWidth == 0;
4870 bool X86::isVINSERT128Index(SDNode *N) {
4871 return isVINSERTIndex(N, 128);
4874 bool X86::isVINSERT256Index(SDNode *N) {
4875 return isVINSERTIndex(N, 256);
4878 bool X86::isVEXTRACT128Index(SDNode *N) {
4879 return isVEXTRACTIndex(N, 128);
4882 bool X86::isVEXTRACT256Index(SDNode *N) {
4883 return isVEXTRACTIndex(N, 256);
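/// Compute the VEXTRACT immediate for an EXTRACT_SUBVECTOR node: the element
/// index divided by the number of elements in a vecWidth-bit chunk.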
4886 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4887 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4888 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4889 "Illegal extract subvector for VEXTRACT");
4892 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4894 MVT VecVT = N->getOperand(0).getSimpleValueType();
4895 MVT ElVT = VecVT.getVectorElementType();
4897 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4898 return Index / NumElemsPerChunk;
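/// Compute the VINSERT immediate for an INSERT_SUBVECTOR node, analogously to
/// getExtractVEXTRACTImmediate above.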
4901 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4902 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4903 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4904 "Illegal insert subvector for VINSERT");
4907 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4909 MVT VecVT = N->getSimpleValueType(0);
4910 MVT ElVT = VecVT.getVectorElementType();
4912 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4913 return Index / NumElemsPerChunk;
4916 /// Return the appropriate immediate to extract the specified
4917 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4918 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4919 return getExtractVEXTRACTImmediate(N, 128);
4922 /// Return the appropriate immediate to extract the specified
4923 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4924 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4925 return getExtractVEXTRACTImmediate(N, 256);
4928 /// Return the appropriate immediate to insert at the specified
4929 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4930 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4931 return getInsertVINSERTImmediate(N, 128);
4934 /// Return the appropriate immediate to insert at the specified
4935 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4936 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4937 return getInsertVINSERTImmediate(N, 256);
4940 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4941 bool X86::isZeroNode(SDValue Elt) {
4942 return isNullConstant(Elt) || isNullFPConstant(Elt);
4945 // Build a vector of constants.
4946 // Use an UNDEF node if MaskElt == -1.
4947 // Split 64-bit constants in 32-bit mode.
4948 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4949 const SDLoc &dl, bool IsMask = false) {
4951 SmallVector<SDValue, 32> Ops;
4954 MVT ConstVecVT = VT;
4955 unsigned NumElts = VT.getVectorNumElements();
4956 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4957 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4958 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4962 MVT EltVT = ConstVecVT.getVectorElementType();
4963 for (unsigned i = 0; i < NumElts; ++i) {
4964 bool IsUndef = Values[i] < 0 && IsMask;
4965 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4966 DAG.getConstant(Values[i], dl, EltVT);
4967 Ops.push_back(OpNode);
4969 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4970 DAG.getConstant(0, dl, EltVT));
4972 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4974 ConstsNode = DAG.getBitcast(VT, ConstsNode);
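// Build a constant vector from raw APInt element bits plus a per-element undef
// mask, splitting 64-bit elements into 32-bit halves when i64 is not legal.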
4978 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4979 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4980 assert(Bits.size() == Undefs.getBitWidth() &&
4981 "Unequal constant and undef arrays");
4982 SmallVector<SDValue, 32> Ops;
4985 MVT ConstVecVT = VT;
4986 unsigned NumElts = VT.getVectorNumElements();
4987 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4988 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4989 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4993 MVT EltVT = ConstVecVT.getVectorElementType();
4994 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4996 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4999 const APInt &V = Bits[i];
5000 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5002 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5003 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5004 } else if (EltVT == MVT::f32) {
5005 APFloat FV(APFloat::IEEEsingle(), V);
5006 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5007 } else if (EltVT == MVT::f64) {
5008 APFloat FV(APFloat::IEEEdouble(), V);
5009 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5011 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5015 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5016 return DAG.getBitcast(VT, ConstsNode);
5019 /// Returns a vector of specified type with all zero elements.
5020 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5021 SelectionDAG &DAG, const SDLoc &dl) {
5022 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5023 VT.getVectorElementType() == MVT::i1) &&
5024 "Unexpected vector type");
5026 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5027 // type. This ensures they get CSE'd. But if the integer type is not
5028 // available, use a floating-point +0.0 instead.
5030 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5031 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5032 } else if (VT.getVectorElementType() == MVT::i1) {
5033 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5034 "Unexpected vector type");
5035 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
5036 "Unexpected vector type");
5037 Vec = DAG.getConstant(0, dl, VT);
5039 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5040 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5042 return DAG.getBitcast(VT, Vec);
5045 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5046 const SDLoc &dl, unsigned vectorWidth) {
5047 EVT VT = Vec.getValueType();
5048 EVT ElVT = VT.getVectorElementType();
5049 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5050 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5051 VT.getVectorNumElements()/Factor);
5053 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5054 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5055 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5057 // This is the index of the first element of the vectorWidth-bit chunk
5058 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5059 IdxVal &= ~(ElemsPerChunk - 1);
5061 // If the input is a buildvector just emit a smaller one.
5062 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5063 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
5064 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5066 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5067 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5070 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5071 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5072 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5073 /// instructions or a simple subregister reference. Idx is an index in the
5074 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5075 /// lowering EXTRACT_VECTOR_ELT operations easier.
5076 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5077 SelectionDAG &DAG, const SDLoc &dl) {
5078 assert((Vec.getValueType().is256BitVector() ||
5079 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5080 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5083 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5084 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5085 SelectionDAG &DAG, const SDLoc &dl) {
5086 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5087 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
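// Insert Vec into Result at element IdxVal (rounded down to a vectorWidth-bit
// chunk boundary) via an INSERT_SUBVECTOR node.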
5090 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5091 SelectionDAG &DAG, const SDLoc &dl,
5092 unsigned vectorWidth) {
5093 assert((vectorWidth == 128 || vectorWidth == 256) &&
5094 "Unsupported vector width");
5095 // Inserting an UNDEF subvector just returns Result.
5098 EVT VT = Vec.getValueType();
5099 EVT ElVT = VT.getVectorElementType();
5100 EVT ResultVT = Result.getValueType();
5102 // Insert the relevant vectorWidth bits.
5103 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5104 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5106 // This is the index of the first element of the vectorWidth-bit chunk
5107 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5108 IdxVal &= ~(ElemsPerChunk - 1);
5110 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5111 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5114 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5115 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5116 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5117 /// simple superregister reference. Idx is an index in the 128 bits
5118 /// we want. It need not be aligned to a 128-bit boundary. That makes
5119 /// lowering INSERT_VECTOR_ELT operations easier.
5120 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5121 SelectionDAG &DAG, const SDLoc &dl) {
5122 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5123 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5126 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5127 SelectionDAG &DAG, const SDLoc &dl) {
5128 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5129 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5132 /// Insert an i1 subvector into an i1 vector.
5133 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5134 const X86Subtarget &Subtarget) {
5137 SDValue Vec = Op.getOperand(0);
5138 SDValue SubVec = Op.getOperand(1);
5139 SDValue Idx = Op.getOperand(2);
5141 if (!isa<ConstantSDNode>(Idx))
5144 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5145 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5148 MVT OpVT = Op.getSimpleValueType();
5149 MVT SubVecVT = SubVec.getSimpleValueType();
5150 unsigned NumElems = OpVT.getVectorNumElements();
5151 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5153 assert(IdxVal + SubVecNumElems <= NumElems &&
5154 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5155 "Unexpected index value in INSERT_SUBVECTOR");
5157 // There are 3 possible cases:
5158 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5159 // 2. Subvector should be inserted in the upper part
5160 // (IdxVal + SubVecNumElems == NumElems)
5161 // 3. Subvector should be inserted in the middle (for example v2i1
5162 // to v16i1, index 2)
5164 // Extend to a natively supported kshift width.
5165 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5166 MVT WideOpVT = OpVT;
5167 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5170 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5171 SDValue Undef = DAG.getUNDEF(WideOpVT);
5172 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5173 Undef, SubVec, ZeroIdx);
5175 // Extract the sub-vector if required.
5176 auto ExtractSubVec = [&](SDValue V) {
5177 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5181 if (Vec.isUndef()) {
5183 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5184 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5187 return ExtractSubVec(WideSubVec);
5190 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5191 NumElems = WideOpVT.getVectorNumElements();
5192 unsigned ShiftLeft = NumElems - SubVecNumElems;
5193 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5194 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5195 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5196 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5197 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5198 return ExtractSubVec(Vec);
5202 // Zero the lower bits of Vec.
5203 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5204 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5205 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5206 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5207 // Merge them together, SubVec should be zero extended.
5208 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5209 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5211 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5212 return ExtractSubVec(Vec);
5215 // Simple case: the subvector is inserted in the upper part.
5216 if (IdxVal + SubVecNumElems == NumElems) {
5217 // Zero the upper bits of Vec.
5218 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5219 DAG.getConstant(IdxVal, dl, MVT::i8));
5220 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5221 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5222 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5223 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5224 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5225 return ExtractSubVec(Vec);
5227 // The subvector should be inserted in the middle - use a shuffle.
5228 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5230 SmallVector<int, 64> Mask;
5231 for (unsigned i = 0; i < NumElems; ++i)
5232 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5234 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5237 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5238 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5239 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5240 /// large BUILD_VECTORS.
5241 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5242 unsigned NumElems, SelectionDAG &DAG,
5244 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5245 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5248 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5249 unsigned NumElems, SelectionDAG &DAG,
5251 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5252 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5255 /// Returns a vector of specified type with all bits set.
5256 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5257 /// Then bitcast to their original type, ensuring they get CSE'd.
5258 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5259 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5260 "Expected a 128/256/512-bit vector type");
5262 APInt Ones = APInt::getAllOnesValue(32);
5263 unsigned NumElts = VT.getSizeInBits() / 32;
5264 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5265 return DAG.getBitcast(VT, Vec);
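// Emit a vector sign/zero extension of In to VT, extracting only the low
// subvector of the input that the extension actually needs for wide results.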
5268 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5269 SelectionDAG &DAG) {
5270 EVT InVT = In.getValueType();
5271 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5273 if (VT.is128BitVector() && InVT.is128BitVector())
5274 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5275 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5277 // For 256-bit vectors, we only need the lower (128-bit) input half.
5278 // For 512-bit vectors, we only need the lower input half or quarter.
5279 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5280 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5281 In = extractSubVector(In, 0, DAG, DL,
5282 std::max(128, (int)VT.getSizeInBits() / Scale));
5285 return DAG.getNode(Opc, DL, VT, In);
5288 /// Generate unpacklo/unpackhi shuffle mask.
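/// e.g. for v4i32 the unpacklo mask is <0,4,1,5> and the unpackhi mask is
/// <2,6,3,7>.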
5289 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5291 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5292 int NumElts = VT.getVectorNumElements();
5293 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5295 for (int i = 0; i < NumElts; ++i) {
5296 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5297 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5298 Pos += (Unary ? 0 : NumElts * (i % 2));
5299 Pos += (Lo ? 0 : NumEltsInLane / 2);
5300 Mask.push_back(Pos);
5304 /// Returns a vector_shuffle node for an unpackl operation.
5305 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5306 SDValue V1, SDValue V2) {
5307 SmallVector<int, 8> Mask;
5308 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5309 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5312 /// Returns a vector_shuffle node for an unpackh operation.
5313 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5314 SDValue V1, SDValue V2) {
5315 SmallVector<int, 8> Mask;
5316 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5317 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5320 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5321 /// This produces a shuffle where the low element of V2 is swizzled into the
5322 /// zero/undef vector, landing at element Idx.
5323 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5324 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5326 const X86Subtarget &Subtarget,
5327 SelectionDAG &DAG) {
5328 MVT VT = V2.getSimpleValueType();
5330 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5331 int NumElems = VT.getVectorNumElements();
5332 SmallVector<int, 16> MaskVec(NumElems);
5333 for (int i = 0; i != NumElems; ++i)
5334 // If this is the insertion idx, put the low elt of V2 here.
5335 MaskVec[i] = (i == Idx) ? NumElems : i;
5336 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
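// Peek through any chain of bitcasts and return the underlying value.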
5339 static SDValue peekThroughBitcasts(SDValue V) {
5340 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5341 V = V.getOperand(0);
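// As above, but only look through a bitcast if its input has a single use.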
5345 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5346 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5347 V.getOperand(0).hasOneUse())
5348 V = V.getOperand(0);
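// If Op (after looking through bitcasts) is a load from a constant pool,
// return the underlying Constant, otherwise return null.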
5352 static const Constant *getTargetConstantFromNode(SDValue Op) {
5353 Op = peekThroughBitcasts(Op);
5355 auto *Load = dyn_cast<LoadSDNode>(Op);
5359 SDValue Ptr = Load->getBasePtr();
5360 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5361 Ptr->getOpcode() == X86ISD::WrapperRIP)
5362 Ptr = Ptr->getOperand(0);
5364 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5365 if (!CNode || CNode->isMachineConstantPoolEntry())
5368 return dyn_cast<Constant>(CNode->getConstVal());
5371 // Extract raw constant bits from constant pools.
5372 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5374 SmallVectorImpl<APInt> &EltBits,
5375 bool AllowWholeUndefs = true,
5376 bool AllowPartialUndefs = true) {
5377 assert(EltBits.empty() && "Expected an empty EltBits vector");
5379 Op = peekThroughBitcasts(Op);
5381 EVT VT = Op.getValueType();
5382 unsigned SizeInBits = VT.getSizeInBits();
5383 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5384 unsigned NumElts = SizeInBits / EltSizeInBits;
5386 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5387 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5389 // Extract all the undef/constant element data and pack into single bitsets.
5390 APInt UndefBits(SizeInBits, 0);
5391 APInt MaskBits(SizeInBits, 0);
5393 // Split the undef/constant single bitset data into the target elements.
5394 auto SplitBitData = [&]() {
5395 // Don't split if we don't allow undef bits.
5396 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5397 if (UndefBits.getBoolValue() && !AllowUndefs)
5400 UndefElts = APInt(NumElts, 0);
5401 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5403 for (unsigned i = 0; i != NumElts; ++i) {
5404 unsigned BitOffset = i * EltSizeInBits;
5405 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5407 // Only treat an element as UNDEF if all bits are UNDEF.
5408 if (UndefEltBits.isAllOnesValue()) {
5409 if (!AllowWholeUndefs)
5411 UndefElts.setBit(i);
5415 // If only some bits are UNDEF then treat them as zero (or bail if not supported).
5417 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5420 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5421 EltBits[i] = Bits.getZExtValue();
5426 // Collect constant bits and insert into mask/undef bit masks.
5427 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5428 unsigned BitOffset) {
5431 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5432 if (isa<UndefValue>(Cst)) {
5433 Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
5436 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5437 Mask.insertBits(CInt->getValue(), BitOffset);
5440 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5441 Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
5447 // Extract constant bits from build vector.
5448 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5449 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5450 const SDValue &Src = Op.getOperand(i);
5451 unsigned BitOffset = i * SrcEltSizeInBits;
5452 if (Src.isUndef()) {
5453 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5456 auto *Cst = cast<ConstantSDNode>(Src);
5457 APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5458 MaskBits.insertBits(Bits, BitOffset);
5460 return SplitBitData();
5463 // Extract constant bits from constant pool vector.
5464 if (auto *Cst = getTargetConstantFromNode(Op)) {
5465 Type *CstTy = Cst->getType();
5466 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5469 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5470 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
5471 if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
5472 i * CstEltSizeInBits))
5475 return SplitBitData();
5478 // Extract constant bits from a broadcasted constant pool scalar.
5479 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5480 EltSizeInBits <= SrcEltSizeInBits) {
5481 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5482 APInt Bits(SizeInBits, 0);
5483 APInt Undefs(SizeInBits, 0);
5484 if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
5485 for (unsigned i = 0; i != NumSrcElts; ++i) {
5486 MaskBits |= Bits.shl(i * SrcEltSizeInBits);
5487 UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
5489 return SplitBitData();
5494 // Extract a rematerialized scalar constant insertion.
5495 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5496 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5497 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5498 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5499 MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5500 MaskBits = MaskBits.zext(SizeInBits);
5501 return SplitBitData();
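// Decode a constant shuffle mask operand into raw integer mask indices.
// Fails if any mask element is wholly or partially undef.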
5507 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5508 unsigned MaskEltSizeInBits,
5509 SmallVectorImpl<uint64_t> &RawMask) {
5511 SmallVector<APInt, 64> EltBits;
5513 // Extract the raw target constant bits.
5514 // FIXME: We currently don't support UNDEF bits or mask entries.
5515 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5516 EltBits, /* AllowWholeUndefs */ false,
5517 /* AllowPartialUndefs */ false))
5520 // Insert the extracted elements into the mask.
5521 for (APInt Elt : EltBits)
5522 RawMask.push_back(Elt.getZExtValue());
5527 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5528 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5529 /// operands in \p Ops, and returns true.
5530 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5531 /// IsUnary for shuffles which use a single input multiple times, and in those
5532 /// cases it will adjust the mask to only have indices within that single input.
5533 /// It is an error to call this with non-empty Mask/Ops vectors.
5534 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5535 SmallVectorImpl<SDValue> &Ops,
5536 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5537 unsigned NumElems = VT.getVectorNumElements();
5540 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5541 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5544 bool IsFakeUnary = false;
5545 switch(N->getOpcode()) {
5546 case X86ISD::BLENDI:
5547 ImmN = N->getOperand(N->getNumOperands()-1);
5548 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5549 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5552 ImmN = N->getOperand(N->getNumOperands()-1);
5553 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5554 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5556 case X86ISD::INSERTPS:
5557 ImmN = N->getOperand(N->getNumOperands()-1);
5558 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5559 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5561 case X86ISD::UNPCKH:
5562 DecodeUNPCKHMask(VT, Mask);
5563 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5565 case X86ISD::UNPCKL:
5566 DecodeUNPCKLMask(VT, Mask);
5567 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5569 case X86ISD::MOVHLPS:
5570 DecodeMOVHLPSMask(NumElems, Mask);
5571 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5573 case X86ISD::MOVLHPS:
5574 DecodeMOVLHPSMask(NumElems, Mask);
5575 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5577 case X86ISD::PALIGNR:
5578 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5579 ImmN = N->getOperand(N->getNumOperands()-1);
5580 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5581 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5582 Ops.push_back(N->getOperand(1));
5583 Ops.push_back(N->getOperand(0));
5585 case X86ISD::VSHLDQ:
5586 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5587 ImmN = N->getOperand(N->getNumOperands() - 1);
5588 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5591 case X86ISD::VSRLDQ:
5592 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5593 ImmN = N->getOperand(N->getNumOperands() - 1);
5594 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5597 case X86ISD::PSHUFD:
5598 case X86ISD::VPERMILPI:
5599 ImmN = N->getOperand(N->getNumOperands()-1);
5600 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5603 case X86ISD::PSHUFHW:
5604 ImmN = N->getOperand(N->getNumOperands()-1);
5605 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5608 case X86ISD::PSHUFLW:
5609 ImmN = N->getOperand(N->getNumOperands()-1);
5610 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5613 case X86ISD::VZEXT_MOVL:
5614 DecodeZeroMoveLowMask(VT, Mask);
5617 case X86ISD::VBROADCAST: {
5618 SDValue N0 = N->getOperand(0);
5619 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5620 // add the pre-extracted value to the Ops vector.
5621 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5622 N0.getOperand(0).getValueType() == VT &&
5623 N0.getConstantOperandVal(1) == 0)
5624 Ops.push_back(N0.getOperand(0));
5626 // We only decode broadcasts of same-sized vectors, unless the broadcast
5627 // came from an extract from the original width. If we found one, we
5628 // pushed it onto the Ops vector above.
5629 if (N0.getValueType() == VT || !Ops.empty()) {
5630 DecodeVectorBroadcast(VT, Mask);
5636 case X86ISD::VPERMILPV: {
5638 SDValue MaskNode = N->getOperand(1);
5639 unsigned MaskEltSize = VT.getScalarSizeInBits();
5640 SmallVector<uint64_t, 32> RawMask;
5641 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5642 DecodeVPERMILPMask(VT, RawMask, Mask);
5645 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5646 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5651 case X86ISD::PSHUFB: {
5653 SDValue MaskNode = N->getOperand(1);
5654 SmallVector<uint64_t, 32> RawMask;
5655 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5656 DecodePSHUFBMask(RawMask, Mask);
5659 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5660 DecodePSHUFBMask(C, Mask);
5665 case X86ISD::VPERMI:
5666 ImmN = N->getOperand(N->getNumOperands()-1);
5667 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5672 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5674 case X86ISD::VPERM2X128:
5675 ImmN = N->getOperand(N->getNumOperands()-1);
5676 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5677 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5679 case X86ISD::MOVSLDUP:
5680 DecodeMOVSLDUPMask(VT, Mask);
5683 case X86ISD::MOVSHDUP:
5684 DecodeMOVSHDUPMask(VT, Mask);
5687 case X86ISD::MOVDDUP:
5688 DecodeMOVDDUPMask(VT, Mask);
5691 case X86ISD::MOVLHPD:
5692 case X86ISD::MOVLPD:
5693 case X86ISD::MOVLPS:
5694 // Not yet implemented
5696 case X86ISD::VPERMIL2: {
5697 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5698 unsigned MaskEltSize = VT.getScalarSizeInBits();
5699 SDValue MaskNode = N->getOperand(2);
5700 SDValue CtrlNode = N->getOperand(3);
5701 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5702 unsigned CtrlImm = CtrlOp->getZExtValue();
5703 SmallVector<uint64_t, 32> RawMask;
5704 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5705 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5708 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5709 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5715 case X86ISD::VPPERM: {
5716 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5717 SDValue MaskNode = N->getOperand(2);
5718 SmallVector<uint64_t, 32> RawMask;
5719 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5720 DecodeVPPERMMask(RawMask, Mask);
5723 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5724 DecodeVPPERMMask(C, Mask);
5729 case X86ISD::VPERMV: {
5731 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5732 Ops.push_back(N->getOperand(1));
5733 SDValue MaskNode = N->getOperand(0);
5734 SmallVector<uint64_t, 32> RawMask;
5735 unsigned MaskEltSize = VT.getScalarSizeInBits();
5736 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5737 DecodeVPERMVMask(RawMask, Mask);
5740 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5741 DecodeVPERMVMask(C, MaskEltSize, Mask);
5746 case X86ISD::VPERMV3: {
5747 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5748 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5749 Ops.push_back(N->getOperand(0));
5750 Ops.push_back(N->getOperand(2));
5751 SDValue MaskNode = N->getOperand(1);
5752 unsigned MaskEltSize = VT.getScalarSizeInBits();
5753 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5754 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5759 case X86ISD::VPERMIV3: {
5760 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5761 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5762 Ops.push_back(N->getOperand(1));
5763 Ops.push_back(N->getOperand(2));
5764 SDValue MaskNode = N->getOperand(0);
5765 unsigned MaskEltSize = VT.getScalarSizeInBits();
5766 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5767 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5772 default: llvm_unreachable("unknown target shuffle node");
5775 // Empty mask indicates the decode failed.
5779 // Check if we're getting a shuffle mask with zero'd elements.
5780 if (!AllowSentinelZero)
5781 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5784 // If we have a fake unary shuffle, the shuffle mask is spread across two
5785 // inputs that are actually the same node. Re-map the mask to always point
5786 // into the first input.
5789 if (M >= (int)Mask.size())
5792 // If we didn't already add operands in the opcode-specific code, default to
5793 // adding 1 or 2 operands starting at 0.
5795 Ops.push_back(N->getOperand(0));
5796 if (!IsUnary || IsFakeUnary)
5797 Ops.push_back(N->getOperand(1));
5803 /// Check a target shuffle mask's inputs to see if we can set any values to
5804 /// SM_SentinelZero - this is for elements that are known to be zero
5805 /// (not just zeroable) from their inputs.
5806 /// Returns true if the target shuffle mask was decoded.
5807 static bool setTargetShuffleZeroElements(SDValue N,
5808 SmallVectorImpl<int> &Mask,
5809 SmallVectorImpl<SDValue> &Ops) {
5811 if (!isTargetShuffle(N.getOpcode()))
5814 MVT VT = N.getSimpleValueType();
5815 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5818 SDValue V1 = Ops[0];
5819 SDValue V2 = IsUnary ? V1 : Ops[1];
5821 V1 = peekThroughBitcasts(V1);
5822 V2 = peekThroughBitcasts(V2);
5824 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5825 "Illegal split of shuffle value type");
5826 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5828 // Extract known constant input data.
5829 APInt UndefSrcElts[2];
5830 SmallVector<APInt, 32> SrcEltBits[2];
5831 bool IsSrcConstant[2] = {
5832 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5833 SrcEltBits[0], true, false),
5834 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5835 SrcEltBits[1], true, false)};
5837 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5840 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5844 // Determine shuffle input and normalize the mask.
5845 unsigned SrcIdx = M / Size;
5846 SDValue V = M < Size ? V1 : V2;
5849 // We are referencing an UNDEF input.
5851 Mask[i] = SM_SentinelUndef;
5855 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5856 // TODO: We currently only set UNDEF for integer types - floats use the same
5857 // registers as vectors and many of the scalar folded loads rely on the
5858 // SCALAR_TO_VECTOR pattern.
5859 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5860 (Size % V.getValueType().getVectorNumElements()) == 0) {
5861 int Scale = Size / V.getValueType().getVectorNumElements();
5862 int Idx = M / Scale;
5863 if (Idx != 0 && !VT.isFloatingPoint())
5864 Mask[i] = SM_SentinelUndef;
5865 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5866 Mask[i] = SM_SentinelZero;
5870 // Attempt to extract from the source's constant bits.
5871 if (IsSrcConstant[SrcIdx]) {
5872 if (UndefSrcElts[SrcIdx][M])
5873 Mask[i] = SM_SentinelUndef;
5874 else if (SrcEltBits[SrcIdx][M] == 0)
5875 Mask[i] = SM_SentinelZero;
5879 assert(VT.getVectorNumElements() == Mask.size() &&
5880 "Different mask size from vector size!");
5884 // Attempt to decode ops that could be represented as a shuffle mask.
5885 // The decoded shuffle mask may contain a different number of elements than the
5886 // destination value type.
5887 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5888 SmallVectorImpl<SDValue> &Ops) {
5892 MVT VT = N.getSimpleValueType();
5893 unsigned NumElts = VT.getVectorNumElements();
5894 unsigned NumSizeInBits = VT.getSizeInBits();
5895 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5896 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5897 "Expected byte aligned value types");
5899 unsigned Opcode = N.getOpcode();
5902 case X86ISD::ANDNP: {
5903 // Attempt to decode as a per-byte mask.
5905 SmallVector<APInt, 32> EltBits;
5906 SDValue N0 = N.getOperand(0);
5907 SDValue N1 = N.getOperand(1);
5908 bool IsAndN = (X86ISD::ANDNP == Opcode);
5909 uint64_t ZeroMask = IsAndN ? 255 : 0;
5910 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5912 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5914 Mask.push_back(SM_SentinelUndef);
5917 uint64_t ByteBits = EltBits[i].getZExtValue();
5918 if (ByteBits != 0 && ByteBits != 255)
5920 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5922 Ops.push_back(IsAndN ? N1 : N0);
5925 case ISD::SCALAR_TO_VECTOR: {
5926 // Match against a scalar_to_vector of an extract from a similar vector.
5927 SDValue N0 = N.getOperand(0);
5928 if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5929 N0.getOperand(0).getValueType() != VT ||
5930 !isa<ConstantSDNode>(N0.getOperand(1)) ||
5931 NumElts <= N0.getConstantOperandVal(1) ||
5932 !N->isOnlyUserOf(N0.getNode()))
5934 Ops.push_back(N0.getOperand(0));
5935 Mask.push_back(N0.getConstantOperandVal(1));
5936 Mask.append(NumElts - 1, SM_SentinelUndef);
5939 case X86ISD::PINSRB:
5940 case X86ISD::PINSRW: {
5941 SDValue InVec = N.getOperand(0);
5942 SDValue InScl = N.getOperand(1);
5943 uint64_t InIdx = N.getConstantOperandVal(2);
5944 assert(InIdx < NumElts && "Illegal insertion index");
5946 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
5947 if (X86::isZeroNode(InScl)) {
5948 Ops.push_back(InVec);
5949 for (unsigned i = 0; i != NumElts; ++i)
5950 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5954 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
5955 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
5957 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5958 if (InScl.getOpcode() != ISD::AssertZext ||
5959 InScl.getOperand(0).getOpcode() != ExOp)
5962 SDValue ExVec = InScl.getOperand(0).getOperand(0);
5963 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
5964 assert(ExIdx < NumElts && "Illegal extraction index");
5965 Ops.push_back(InVec);
5966 Ops.push_back(ExVec);
5967 for (unsigned i = 0; i != NumElts; ++i)
5968 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
5972 case X86ISD::VSRLI: {
5973 uint64_t ShiftVal = N.getConstantOperandVal(1);
5974 // Out of range bit shifts are guaranteed to be zero.
5975 if (NumBitsPerElt <= ShiftVal) {
5976 Mask.append(NumElts, SM_SentinelZero);
5980 // We can only decode 'whole byte' bit shifts as shuffles.
5981 if ((ShiftVal % 8) != 0)
5984 uint64_t ByteShift = ShiftVal / 8;
5985 unsigned NumBytes = NumSizeInBits / 8;
5986 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5987 Ops.push_back(N.getOperand(0));
5989 // Clear mask to all zeros and insert the shifted byte indices.
5990 Mask.append(NumBytes, SM_SentinelZero);
5992 if (X86ISD::VSHLI == Opcode) {
5993 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5994 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5995 Mask[i + j] = i + j - ByteShift;
5997 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5998 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5999 Mask[i + j - ByteShift] = i + j;
6003 case ISD::ZERO_EXTEND_VECTOR_INREG:
6004 case X86ISD::VZEXT: {
6005 // TODO - add support for VPMOVZX with smaller input vector types.
6006 SDValue Src = N.getOperand(0);
6007 MVT SrcVT = Src.getSimpleValueType();
6008 if (NumSizeInBits != SrcVT.getSizeInBits())
6010 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6019 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6020 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6021 SmallVectorImpl<int> &Mask) {
6022 int MaskWidth = Mask.size();
6023 SmallVector<SDValue, 16> UsedInputs;
6024 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6025 int lo = UsedInputs.size() * MaskWidth;
6026 int hi = lo + MaskWidth;
6027 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6028 UsedInputs.push_back(Inputs[i]);
6035 Inputs = UsedInputs;
6038 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6039 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6040 /// remaining input indices in case we now have a unary shuffle and adjust the
6041 /// inputs accordingly.
6042 /// Returns true if the target shuffle mask was decoded.
6043 static bool resolveTargetShuffleInputs(SDValue Op,
6044 SmallVectorImpl<SDValue> &Inputs,
6045 SmallVectorImpl<int> &Mask) {
6046 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6047 if (!getFauxShuffleMask(Op, Mask, Inputs))
6050 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6054 /// Returns the scalar element that will make up the ith
6055 /// element of the result of the vector shuffle.
6056 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6059 return SDValue(); // Limit search depth.
6061 SDValue V = SDValue(N, 0);
6062 EVT VT = V.getValueType();
6063 unsigned Opcode = V.getOpcode();
6065 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6066 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6067 int Elt = SV->getMaskElt(Index);
6070 return DAG.getUNDEF(VT.getVectorElementType());
6072 unsigned NumElems = VT.getVectorNumElements();
6073 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6074 : SV->getOperand(1);
6075 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6078 // Recurse into target specific vector shuffles to find scalars.
6079 if (isTargetShuffle(Opcode)) {
6080 MVT ShufVT = V.getSimpleValueType();
6081 MVT ShufSVT = ShufVT.getVectorElementType();
6082 int NumElems = (int)ShufVT.getVectorNumElements();
6083 SmallVector<int, 16> ShuffleMask;
6084 SmallVector<SDValue, 16> ShuffleOps;
6087 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6090 int Elt = ShuffleMask[Index];
6091 if (Elt == SM_SentinelZero)
6092 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6093 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6094 if (Elt == SM_SentinelUndef)
6095 return DAG.getUNDEF(ShufSVT);
6097 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6098 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6099 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6103 // Actual nodes that may contain scalar elements
6104 if (Opcode == ISD::BITCAST) {
6105 V = V.getOperand(0);
6106 EVT SrcVT = V.getValueType();
6107 unsigned NumElems = VT.getVectorNumElements();
6109 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6113 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6114 return (Index == 0) ? V.getOperand(0)
6115 : DAG.getUNDEF(VT.getVectorElementType());
6117 if (V.getOpcode() == ISD::BUILD_VECTOR)
6118 return V.getOperand(Index);
6123 /// Custom lower build_vector of v16i8.
6124 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6125 unsigned NumNonZero, unsigned NumZero,
6127 const X86Subtarget &Subtarget) {
6135 // SSE4.1 - use PINSRB to insert each byte directly.
6136 if (Subtarget.hasSSE41()) {
6137 for (unsigned i = 0; i < 16; ++i) {
6138 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6140 // If the build vector contains zeros or our first insertion is not the
6141 // first index, insert into a zero vector to break any register
6142 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6145 if (NumZero || 0 != i)
6146 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6148 assert(0 == i && "Expected insertion into zero-index");
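// Extend the scalar to i32, move it into lane 0 with the upper lanes zeroed
// (VZEXT_MOVL), then bitcast the v4i32 back to v16i8.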
6149 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6150 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6151 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6152 V = DAG.getBitcast(MVT::v16i8, V);
6156 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6157 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6164 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6165 for (unsigned i = 0; i < 16; ++i) {
6166 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6167 if (ThisIsNonZero && First) {
6169 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6171 V = DAG.getUNDEF(MVT::v8i16);
6176 // FIXME: Investigate extending to i32 instead of just i16.
6177 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
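// Pair the odd byte i with the preceding even byte: zero extend the odd byte,
// shift it left by 8 and OR it with byte i-1 to build a single i16 element.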
6178 SDValue ThisElt, LastElt;
6179 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6180 if (LastIsNonZero) {
6182 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6184 if (ThisIsNonZero) {
6185 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6186 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6187 DAG.getConstant(8, dl, MVT::i8));
6189 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6195 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6196 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6197 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6198 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6199 V = DAG.getBitcast(MVT::v8i16, V);
6201 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6202 DAG.getIntPtrConstant(i / 2, dl));
6208 return DAG.getBitcast(MVT::v16i8, V);
6211 /// Custom lower build_vector of v8i16.
6212 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6213 unsigned NumNonZero, unsigned NumZero,
6215 const X86Subtarget &Subtarget) {
6222 for (unsigned i = 0; i < 8; ++i) {
6223 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6225 // If the build vector contains zeros or our first insertion is not the
6226 // first index, insert into a zero vector to break any register
6227 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6230 if (NumZero || 0 != i)
6231 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6233 assert(0 == i && "Expected insertion into zero-index");
6234 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6235 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6236 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6237 V = DAG.getBitcast(MVT::v8i16, V);
6241 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6242 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6249 /// Custom lower build_vector of v4i32 or v4f32.
6250 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6251 const X86Subtarget &Subtarget) {
6252 // Find all zeroable elements.
6253 std::bitset<4> Zeroable;
6254 for (int i=0; i < 4; ++i) {
6255 SDValue Elt = Op->getOperand(i);
6256 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6258 assert(Zeroable.size() - Zeroable.count() > 1 &&
6259 "We expect at least two non-zero elements!");
6261 // We only know how to deal with build_vector nodes where elements are either
6262 // zeroable or extract_vector_elt with constant index.
6263 SDValue FirstNonZero;
6264 unsigned FirstNonZeroIdx;
6265 for (unsigned i=0; i < 4; ++i) {
6268 SDValue Elt = Op->getOperand(i);
6269 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6270 !isa<ConstantSDNode>(Elt.getOperand(1)))
6272 // Make sure that this node is extracting from a 128-bit vector.
6273 MVT VT = Elt.getOperand(0).getSimpleValueType();
6274 if (!VT.is128BitVector())
6276 if (!FirstNonZero.getNode()) {
6278 FirstNonZeroIdx = i;
6282 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6283 SDValue V1 = FirstNonZero.getOperand(0);
6284 MVT VT = V1.getSimpleValueType();
6286 // See if this build_vector can be lowered as a blend with zero.
6288 unsigned EltMaskIdx, EltIdx;
6290 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6291 if (Zeroable[EltIdx]) {
6292 // The zero vector will be on the right hand side.
6293 Mask[EltIdx] = EltIdx+4;
6297 Elt = Op->getOperand(EltIdx);
6298 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6299 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
6300 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6302 Mask[EltIdx] = EltIdx;
6306 // Let the shuffle legalizer deal with blend operations.
6307 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6308 if (V1.getSimpleValueType() != VT)
6309 V1 = DAG.getBitcast(VT, V1);
6310 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6313 // See if we can lower this build_vector to a INSERTPS.
6314 if (!Subtarget.hasSSE41())
6317 SDValue V2 = Elt.getOperand(0);
6318 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6321 bool CanFold = true;
6322 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6326 SDValue Current = Op->getOperand(i);
6327 SDValue SrcVector = Current->getOperand(0);
6330 CanFold = SrcVector == V1 &&
6331 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6337 assert(V1.getNode() && "Expected at least two non-zero elements!");
6338 if (V1.getSimpleValueType() != MVT::v4f32)
6339 V1 = DAG.getBitcast(MVT::v4f32, V1);
6340 if (V2.getSimpleValueType() != MVT::v4f32)
6341 V2 = DAG.getBitcast(MVT::v4f32, V2);
6343 // Ok, we can emit an INSERTPS instruction.
6344 unsigned ZMask = Zeroable.to_ulong();
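// INSERTPS immediate layout: bits [7:6] select the source element, bits [5:4]
// select the destination element and bits [3:0] are the zero mask.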
6346 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6347 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6349 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6350 DAG.getIntPtrConstant(InsertPSMask, DL));
6351 return DAG.getBitcast(VT, Result);
6354 /// Return a vector logical shift node.
6355 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6356 SelectionDAG &DAG, const TargetLowering &TLI,
6358 assert(VT.is128BitVector() && "Unknown type for VShift");
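// VSHLDQ/VSRLDQ shift the full 128-bit vector by whole bytes, so operate on
// v16i8 and convert the bit count into a byte count.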
6359 MVT ShVT = MVT::v16i8;
6360 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6361 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6362 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6363 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6364 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6365 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6368 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6369 SelectionDAG &DAG) {
6371 // Check if the scalar load can be widened into a vector load, and if
6372 // the address is "base + cst", see if the cst can be "absorbed" into
6373 // the shuffle mask.
6374 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6375 SDValue Ptr = LD->getBasePtr();
6376 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6378 EVT PVT = LD->getValueType(0);
6379 if (PVT != MVT::i32 && PVT != MVT::f32)
6384 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6385 FI = FINode->getIndex();
6387 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6388 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6389 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6390 Offset = Ptr.getConstantOperandVal(1);
6391 Ptr = Ptr.getOperand(0);
6396 // FIXME: 256-bit vector instructions don't require strict alignment;
6397 // improve this code to support it better.
6398 unsigned RequiredAlign = VT.getSizeInBits()/8;
6399 SDValue Chain = LD->getChain();
6400 // Make sure the stack object alignment is at least 16 or 32.
6401 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6402 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6403 if (MFI.isFixedObjectIndex(FI)) {
6404 // Can't change the alignment. FIXME: It's possible to compute
6405 // the exact stack offset and reference FI + adjusted offset instead;
6406 // if someone *really* cares about this, that's the way to implement it.
6409 MFI.setObjectAlignment(FI, RequiredAlign);
6413 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6414 // Ptr + (Offset & ~15).
6417 if ((Offset % RequiredAlign) & 3)
6419 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6422 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6423 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
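// Load the wider vector starting at the aligned offset and then splat the
// element that holds the originally requested scalar.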
6426 int EltNo = (Offset - StartOffset) >> 2;
6427 unsigned NumElems = VT.getVectorNumElements();
6429 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6430 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6431 LD->getPointerInfo().getWithOffset(StartOffset));
6433 SmallVector<int, 8> Mask(NumElems, EltNo);
6435 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6441 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6442 /// elements can be replaced by a single large load which has the same value as
6443 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6445 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6446 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6447 const SDLoc &DL, SelectionDAG &DAG,
6448 bool isAfterLegalize) {
6449 unsigned NumElems = Elts.size();
6451 int LastLoadedElt = -1;
6452 SmallBitVector LoadMask(NumElems, false);
6453 SmallBitVector ZeroMask(NumElems, false);
6454 SmallBitVector UndefMask(NumElems, false);
6456 // For each element in the initializer, see if we've found a load, zero or an
6458 for (unsigned i = 0; i < NumElems; ++i) {
6459 SDValue Elt = peekThroughBitcasts(Elts[i]);
6464 UndefMask[i] = true;
6465 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6467 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6470 // Each loaded element must be the correct fractional portion of the
6471 // requested vector load.
6472 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6477 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6478 "Incomplete element masks");
6480 // Handle Special Cases - all undef or undef/zero.
6481 if (UndefMask.count() == NumElems)
6482 return DAG.getUNDEF(VT);
6484 // FIXME: Should we return this as a BUILD_VECTOR instead?
6485 if ((ZeroMask | UndefMask).count() == NumElems)
6486 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6487 : DAG.getConstantFP(0.0, DL, VT);
6489 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6490 int FirstLoadedElt = LoadMask.find_first();
6491 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6492 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6493 EVT LDBaseVT = EltBase.getValueType();
6495 // Consecutive loads can contain UNDEFs but not ZERO elements.
6496 // Consecutive loads with UNDEF and ZERO elements require
6497 // an additional shuffle stage to clear the ZERO elements.
6498 bool IsConsecutiveLoad = true;
6499 bool IsConsecutiveLoadWithZeros = true;
6500 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6502 SDValue Elt = peekThroughBitcasts(Elts[i]);
6503 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6504 if (!DAG.areNonVolatileConsecutiveLoads(
6505 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6506 i - FirstLoadedElt)) {
6507 IsConsecutiveLoad = false;
6508 IsConsecutiveLoadWithZeros = false;
6511 } else if (ZeroMask[i]) {
6512 IsConsecutiveLoad = false;
6516 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6517 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6518 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6519 "Cannot merge volatile loads.");
6521 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6522 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
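// If the old base load's chain has users, splice the new wide load into the
// chain with a TokenFactor so the original memory ordering is preserved.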
6524 if (LDBase->hasAnyUseOfValue(1)) {
6526 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6527 SDValue(NewLd.getNode(), 1));
6528 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6529 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6530 SDValue(NewLd.getNode(), 1));
6536 // LOAD - all consecutive load/undefs (must start/end with a load).
6537 // If we have found an entire vector of loads and undefs, then return a large
6538 // load of the entire vector width starting at the base pointer.
6539 // If the vector contains zeros, then attempt to shuffle those elements.
6540 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6541 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6542 assert(LDBase && "Did not find base load for merging consecutive loads");
6543 EVT EltVT = LDBase->getValueType(0);
6544 // Ensure that the input vector size for the merged loads matches the
6545 // cumulative size of the input elements.
6546 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6549 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6552 if (IsConsecutiveLoad)
6553 return CreateLoad(VT, LDBase);
6555 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6556 // vector and a zero vector to clear out the zero elements.
6557 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6558 SmallVector<int, 4> ClearMask(NumElems, -1);
6559 for (unsigned i = 0; i < NumElems; ++i) {
6561 ClearMask[i] = i + NumElems;
6562 else if (LoadMask[i])
6565 SDValue V = CreateLoad(VT, LDBase);
6566 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6567 : DAG.getConstantFP(0.0, DL, VT);
6568 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6573 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6575 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6576 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6577 (LoadSize == 32 || LoadSize == 64) &&
6578 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6579 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6580 : MVT::getIntegerVT(LoadSize);
6581 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
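// VZEXT_LOAD loads just the low 32/64 bits and zero-fills the remaining
// elements, matching the trailing zero/undef elements of the build vector.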
6582 if (TLI.isTypeLegal(VecVT)) {
6583 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6584 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6586 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6587 LDBase->getPointerInfo(),
6588 LDBase->getAlignment(),
6589 false/*isVolatile*/, true/*ReadMem*/,
6592 // Make sure the newly-created LOAD is in the same position as LDBase in
6593 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6594 // and update uses of LDBase's output chain to use the TokenFactor.
6595 if (LDBase->hasAnyUseOfValue(1)) {
6597 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6598 SDValue(ResNode.getNode(), 1));
6599 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6600 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6601 SDValue(ResNode.getNode(), 1));
6604 return DAG.getBitcast(VT, ResNode);
6611 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6612 unsigned SplatBitSize, LLVMContext &C) {
6613 unsigned ScalarSize = VT.getScalarSizeInBits();
6614 unsigned NumElm = SplatBitSize / ScalarSize;
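// Split the repeated SplatBitSize-wide value into NumElm scalar constants of
// the vector's element type.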
6616 SmallVector<Constant *, 32> ConstantVec;
6617 for (unsigned i = 0; i < NumElm; i++) {
6618 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6620 if (VT.isFloatingPoint()) {
6621 assert((ScalarSize == 32 || ScalarSize == 64) &&
6622 "Unsupported floating point scalar size");
6623 if (ScalarSize == 32)
6624 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6626 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6628 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6629 ConstantVec.push_back(Const);
6631 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6634 static bool isUseOfShuffle(SDNode *N) {
6635 for (auto *U : N->uses()) {
6636 if (isTargetShuffle(U->getOpcode()))
6638 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6639 return isUseOfShuffle(U);
6644 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6645 /// following cases:
6646 /// 1. A splat BUILD_VECTOR which uses:
6647 /// a. A single scalar load, or a constant.
6648 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6649 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6650 /// a scalar load, or a constant.
6652 /// The VBROADCAST node is returned when a pattern is found,
6653 /// or SDValue() otherwise.
6654 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6655 SelectionDAG &DAG) {
6656 // VBROADCAST requires AVX.
6657 // TODO: Splats could be generated for non-AVX CPUs using SSE
6658 // instructions, but there's less potential gain for only 128-bit vectors.
6659 if (!Subtarget.hasAVX())
6662 MVT VT = BVOp->getSimpleValueType(0);
6665 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6666 "Unsupported vector type for broadcast.");
6668 BitVector UndefElements;
6669 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6671 // We need a splat of a single value to use broadcast, and it doesn't
6672 // make any sense if the value is only in one element of the vector.
6673 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6674 APInt SplatValue, Undef;
6675 unsigned SplatBitSize;
6677 // Check if this is a repeated constant pattern suitable for broadcasting.
6678 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6679 SplatBitSize > VT.getScalarSizeInBits() &&
6680 SplatBitSize < VT.getSizeInBits()) {
6681 // Avoid replacing with broadcast when it's a use of a shuffle
6682 // instruction to preserve the present custom lowering of shuffles.
6683 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6685 // replace BUILD_VECTOR with broadcast of the repeated constants.
6686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6687 LLVMContext *Ctx = DAG.getContext();
6688 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6689 if (Subtarget.hasAVX()) {
6690 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6691 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6692 // Splatted value can fit in one INTEGER constant in constant pool.
6693 // Load the constant and broadcast it.
6694 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6695 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6696 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6697 SDValue CP = DAG.getConstantPool(C, PVT);
6698 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6700 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6702 CVT, dl, DAG.getEntryNode(), CP,
6703 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6705 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6706 MVT::getVectorVT(CVT, Repeat), Ld);
6707 return DAG.getBitcast(VT, Brdcst);
6708 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6709 // Splatted value can fit in one FLOAT constant in constant pool.
6710 // Load the constant and broadcast it.
6711 // AVX has support for 32-bit and 64-bit broadcasts for floats only.
6712 // There is no 64-bit integer broadcast on a 32-bit subtarget.
6713 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6714 Constant *C = SplatBitSize == 32
6715 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6716 SplatValue.bitsToFloat())
6717 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6718 SplatValue.bitsToDouble());
6719 SDValue CP = DAG.getConstantPool(C, PVT);
6720 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6722 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6724 CVT, dl, DAG.getEntryNode(), CP,
6725 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6727 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6728 MVT::getVectorVT(CVT, Repeat), Ld);
6729 return DAG.getBitcast(VT, Brdcst);
6730 } else if (SplatBitSize > 64) {
6731 // Load the vector of constants and broadcast it.
6732 MVT CVT = VT.getScalarType();
6733 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6735 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6736 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6737 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6739 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6740 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6742 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6743 return DAG.getBitcast(VT, Brdcst);
6750 bool ConstSplatVal =
6751 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6753 // Make sure that all of the users of a non-constant load are from the
6754 // BUILD_VECTOR node.
6755 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6758 unsigned ScalarSize = Ld.getValueSizeInBits();
6759 bool IsGE256 = (VT.getSizeInBits() >= 256);
6761 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6762 // instruction to save 8 or more bytes of constant pool data.
6763 // TODO: If multiple splats are generated to load the same constant,
6764 // it may be detrimental to overall size. There needs to be a way to detect
6765 // that condition to know if this is truly a size win.
6766 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6768 // Handle broadcasting a single constant scalar from the constant pool into a vector.
6770 // On Sandybridge (no AVX2), it is still better to load a constant vector
6771 // from the constant pool and not to broadcast it from a scalar.
6772 // But override that restriction when optimizing for size.
6773 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6774 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6775 EVT CVT = Ld.getValueType();
6776 assert(!CVT.isVector() && "Must not broadcast a vector type");
6778 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6779 // For size optimization, also splat v2f64 and v2i64, and for size opt
6780 // with AVX2, also splat i8 and i16.
6781 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6782 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6783 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6784 const Constant *C = nullptr;
6785 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6786 C = CI->getConstantIntValue();
6787 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6788 C = CF->getConstantFPValue();
6790 assert(C && "Invalid constant type");
6792 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6794 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6795 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6797 CVT, dl, DAG.getEntryNode(), CP,
6798 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6801 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6805 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6807 // Handle AVX2 in-register broadcasts.
6808 if (!IsLoad && Subtarget.hasInt256() &&
6809 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6810 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6812 // The scalar source must be a normal load.
6816 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6817 (Subtarget.hasVLX() && ScalarSize == 64))
6818 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6820 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
6821 // match double, since there is no vbroadcastsd xmm instruction.
6822 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6823 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6824 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6827 // Unsupported broadcast.
6831 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6832 /// underlying vector and index.
6834 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6836 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6838 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6839 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6842 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6844 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6846 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6847 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6850 // In this case the vector is the extract_subvector expression and the index
6851 // is 2, as specified by the shuffle.
6852 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6853 SDValue ShuffleVec = SVOp->getOperand(0);
6854 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6855 assert(ShuffleVecVT.getVectorElementType() ==
6856 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6858 int ShuffleIdx = SVOp->getMaskElt(Idx);
6859 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6860 ExtractedFromVec = ShuffleVec;
6866 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6867 MVT VT = Op.getSimpleValueType();
6869 // Skip if insert_vec_elt is not supported.
6870 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6871 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6875 unsigned NumElems = Op.getNumOperands();
6879 SmallVector<unsigned, 4> InsertIndices;
6880 SmallVector<int, 8> Mask(NumElems, -1);
6882 for (unsigned i = 0; i != NumElems; ++i) {
6883 unsigned Opc = Op.getOperand(i).getOpcode();
6885 if (Opc == ISD::UNDEF)
6888 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6889 // Quit if more than 1 element needs inserting.
6890 if (InsertIndices.size() > 1)
6893 InsertIndices.push_back(i);
6897 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6898 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6900 // Quit if non-constant index.
6901 if (!isa<ConstantSDNode>(ExtIdx))
6903 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6905 // Quit if extracted from vector of different type.
6906 if (ExtractedFromVec.getValueType() != VT)
6909 if (!VecIn1.getNode())
6910 VecIn1 = ExtractedFromVec;
6911 else if (VecIn1 != ExtractedFromVec) {
6912 if (!VecIn2.getNode())
6913 VecIn2 = ExtractedFromVec;
6914 else if (VecIn2 != ExtractedFromVec)
6915 // Quit if more than 2 vectors to shuffle
6919 if (ExtractedFromVec == VecIn1)
6921 else if (ExtractedFromVec == VecIn2)
6922 Mask[i] = Idx + NumElems;
6925 if (!VecIn1.getNode())
6928 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6929 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6931 for (unsigned Idx : InsertIndices)
6932 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6933 DAG.getIntPtrConstant(Idx, DL));
6938 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6939 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6940 Op.getScalarValueSizeInBits() == 1 &&
6941 "Can not convert non-constant vector");
6942 uint64_t Immediate = 0;
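// Pack the constant i1 elements into a scalar bit mask, one bit per element.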
6943 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6944 SDValue In = Op.getOperand(idx);
6946 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6949 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6950 return DAG.getConstant(Immediate, dl, VT);
6952 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6954 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6956 MVT VT = Op.getSimpleValueType();
6957 assert((VT.getVectorElementType() == MVT::i1) &&
6958 "Unexpected type in LowerBUILD_VECTORvXi1!");
6961 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6962 return DAG.getTargetConstant(0, dl, VT);
6964 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6965 return DAG.getTargetConstant(1, dl, VT);
6967 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6968 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6969 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6970 return DAG.getBitcast(VT, Imm);
6971 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6972 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6973 DAG.getIntPtrConstant(0, dl));
6976 // Vector has one or more non-const elements
6977 uint64_t Immediate = 0;
6978 SmallVector<unsigned, 16> NonConstIdx;
6979 bool IsSplat = true;
6980 bool HasConstElts = false;
6982 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6983 SDValue In = Op.getOperand(idx);
6986 if (!isa<ConstantSDNode>(In))
6987 NonConstIdx.push_back(idx);
6989 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6990 HasConstElts = true;
6994 else if (In != Op.getOperand(SplatIdx))
6998 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7000 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
7001 DAG.getConstant(1, dl, VT),
7002 DAG.getConstant(0, dl, VT));
7004 // insert elements one by one
7008 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7009 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7011 else if (HasConstElts)
7012 Imm = DAG.getConstant(0, dl, VT);
7014 Imm = DAG.getUNDEF(VT);
7015 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7016 DstVec = DAG.getBitcast(VT, Imm);
7018 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7019 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7020 DAG.getIntPtrConstant(0, dl));
7023 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7024 unsigned InsertIdx = NonConstIdx[i];
7025 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7026 Op.getOperand(InsertIdx),
7027 DAG.getIntPtrConstant(InsertIdx, dl));
7032 /// \brief Return true if \p N implements a horizontal binop and return the
7033 /// operands for the horizontal binop into V0 and V1.
7035 /// This is a helper function of LowerToHorizontalOp().
7036 /// This function checks that the build_vector \p N in input implements a
7037 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7038 /// operation to match.
7039 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7040 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7041 /// is equal to ISD::SUB, then this function checks if this is a horizontal sub.
7044 /// This function only analyzes elements of \p N whose indices are
7045 /// in range [BaseIdx, LastIdx).
7046 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7048 unsigned BaseIdx, unsigned LastIdx,
7049 SDValue &V0, SDValue &V1) {
7050 EVT VT = N->getValueType(0);
7052 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7053 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7054 "Invalid Vector in input!");
7056 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7057 bool CanFold = true;
7058 unsigned ExpectedVExtractIdx = BaseIdx;
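// ExpectedVExtractIdx tracks the extract index the next element's operands
// must start from; it advances by two per element and resets to BaseIdx at the
// halfway point, where the source vector switches from V0 to V1.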
7059 unsigned NumElts = LastIdx - BaseIdx;
7060 V0 = DAG.getUNDEF(VT);
7061 V1 = DAG.getUNDEF(VT);
7063 // Check if N implements a horizontal binop.
7064 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7065 SDValue Op = N->getOperand(i + BaseIdx);
7068 if (Op->isUndef()) {
7069 // Update the expected vector extract index.
7070 if (i * 2 == NumElts)
7071 ExpectedVExtractIdx = BaseIdx;
7072 ExpectedVExtractIdx += 2;
7076 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7081 SDValue Op0 = Op.getOperand(0);
7082 SDValue Op1 = Op.getOperand(1);
7084 // Try to match the following pattern:
7085 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7086 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7087 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7088 Op0.getOperand(0) == Op1.getOperand(0) &&
7089 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7090 isa<ConstantSDNode>(Op1.getOperand(1)));
7094 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7095 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7097 if (i * 2 < NumElts) {
7099 V0 = Op0.getOperand(0);
7100 if (V0.getValueType() != VT)
7105 V1 = Op0.getOperand(0);
7106 if (V1.getValueType() != VT)
7109 if (i * 2 == NumElts)
7110 ExpectedVExtractIdx = BaseIdx;
7113 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7114 if (I0 == ExpectedVExtractIdx)
7115 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7116 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7117 // Try to match the following dag sequence:
7118 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7119 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7123 ExpectedVExtractIdx += 2;
7129 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7130 /// a concat_vector.
7132 /// This is a helper function of LowerToHorizontalOp().
7133 /// This function expects two 256-bit vectors called V0 and V1.
7134 /// At first, each vector is split into two separate 128-bit vectors.
7135 /// Then, the resulting 128-bit vectors are used to implement two
7136 /// horizontal binary operations.
7138 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7140 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7141 /// the two new horizontal binop.
7142 /// When Mode is set, the first horizontal binop dag node would take as input
7143 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7144 /// horizontal binop dag node would take as input the lower 128-bit of V1
7145 /// and the upper 128-bit of V1.
7147 /// HADD V0_LO, V0_HI
7148 /// HADD V1_LO, V1_HI
7150 /// Otherwise, the first horizontal binop dag node takes as input the lower
7151 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7152 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7154 /// HADD V0_LO, V1_LO
7155 /// HADD V0_HI, V1_HI
7157 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7158 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7159 /// the upper 128-bits of the result.
7160 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7161 const SDLoc &DL, SelectionDAG &DAG,
7162 unsigned X86Opcode, bool Mode,
7163 bool isUndefLO, bool isUndefHI) {
7164 MVT VT = V0.getSimpleValueType();
7165 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7166 "Invalid nodes in input!");
7168 unsigned NumElts = VT.getVectorNumElements();
7169 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7170 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7171 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7172 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7173 MVT NewVT = V0_LO.getSimpleValueType();
7175 SDValue LO = DAG.getUNDEF(NewVT);
7176 SDValue HI = DAG.getUNDEF(NewVT);
7179 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7180 if (!isUndefLO && !V0->isUndef())
7181 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7182 if (!isUndefHI && !V1->isUndef())
7183 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7185 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7186 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7187 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7189 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7190 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7193 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7196 /// Returns true iff \p BV builds a vector with the result equivalent to
7197 /// the result of an ADDSUB operation.
7198 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7199 /// are written to the parameters \p Opnd0 and \p Opnd1.
7200 static bool isAddSub(const BuildVectorSDNode *BV,
7201 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7202 SDValue &Opnd0, SDValue &Opnd1) {
7204 MVT VT = BV->getSimpleValueType(0);
7205 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7206 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7207 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7210 unsigned NumElts = VT.getVectorNumElements();
7211 SDValue InVec0 = DAG.getUNDEF(VT);
7212 SDValue InVec1 = DAG.getUNDEF(VT);
7214 // Odd-numbered elements in the input build vector are obtained from
7215 // adding two integer/float elements.
7216 // Even-numbered elements in the input build vector are obtained from
7217 // subtracting two integer/float elements.
7218 unsigned ExpectedOpcode = ISD::FSUB;
7219 unsigned NextExpectedOpcode = ISD::FADD;
7220 bool AddFound = false;
7221 bool SubFound = false;
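// Walk the build_vector: even lanes must come from an FSUB and odd lanes from
// an FADD, each extracting element i from the same pair of source vectors.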
7223 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7224 SDValue Op = BV->getOperand(i);
7226 // Skip 'undef' values.
7227 unsigned Opcode = Op.getOpcode();
7228 if (Opcode == ISD::UNDEF) {
7229 std::swap(ExpectedOpcode, NextExpectedOpcode);
7233 // Early exit if we found an unexpected opcode.
7234 if (Opcode != ExpectedOpcode)
7237 SDValue Op0 = Op.getOperand(0);
7238 SDValue Op1 = Op.getOperand(1);
7240 // Try to match the following pattern:
7241 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7242 // Early exit if we cannot match that sequence.
7243 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7244 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7245 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7246 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7247 Op0.getOperand(1) != Op1.getOperand(1))
7250 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7254 // We found a valid add/sub node. Update the information accordingly.
7260 // Update InVec0 and InVec1.
7261 if (InVec0.isUndef()) {
7262 InVec0 = Op0.getOperand(0);
7263 if (InVec0.getSimpleValueType() != VT)
7266 if (InVec1.isUndef()) {
7267 InVec1 = Op1.getOperand(0);
7268 if (InVec1.getSimpleValueType() != VT)
7272 // Make sure that operands in input to each add/sub node always
7273 // come from a same pair of vectors.
7274 if (InVec0 != Op0.getOperand(0)) {
7275 if (ExpectedOpcode == ISD::FSUB)
7278 // FADD is commutable. Try to commute the operands
7279 // and then test again.
7280 std::swap(Op0, Op1);
7281 if (InVec0 != Op0.getOperand(0))
7285 if (InVec1 != Op1.getOperand(0))
7288 // Update the pair of expected opcodes.
7289 std::swap(ExpectedOpcode, NextExpectedOpcode);
7292 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7293 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7301 /// Returns true if it is possible to fold MUL and an idiom that has already been
7302 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7303 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7304 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7306 /// Prior to calling this function it should be known that there is some
7307 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7308 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7309 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7310 /// of \p Opnd0 uses is expected to be equal to 2.
7311 /// For example, this function may be called for the following IR:
7312 /// %AB = fmul fast <2 x double> %A, %B
7313 /// %Sub = fsub fast <2 x double> %AB, %C
7314 /// %Add = fadd fast <2 x double> %AB, %C
7315 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7316 /// <2 x i32> <i32 0, i32 3>
7317 /// There is a def for %Addsub here, which potentially can be replaced by
7318 /// X86ISD::ADDSUB operation:
7319 /// %Addsub = X86ISD::ADDSUB %AB, %C
7320 /// and such ADDSUB can further be replaced with FMADDSUB:
7321 /// %Addsub = FMADDSUB %A, %B, %C.
7323 /// The main reason why this method is called before the replacement of the
7324 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7325 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
7327 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7328 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7329 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7330 !Subtarget.hasAnyFMA())
7333 // FIXME: These checks must match the similar ones in
7334 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7335 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7336 // or MUL + ADDSUB to FMADDSUB.
7337 const TargetOptions &Options = DAG.getTarget().Options;
7339 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7344 Opnd1 = Opnd0.getOperand(1);
7345 Opnd0 = Opnd0.getOperand(0);
7350 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7351 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7352 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7353 const X86Subtarget &Subtarget,
7354 SelectionDAG &DAG) {
7355 SDValue Opnd0, Opnd1;
7356 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7359 MVT VT = BV->getSimpleValueType(0);
7362 // Try to generate X86ISD::FMADDSUB node here.
7364 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7365 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7367 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7368 // the ADDSUB idiom has been successfully recognized. There are no known
7369 // X86 targets with 512-bit ADDSUB instructions!
7370 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7372 if (VT.is512BitVector())
7375 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7378 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7379 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7380 const X86Subtarget &Subtarget,
7381 SelectionDAG &DAG) {
7382 MVT VT = BV->getSimpleValueType(0);
7383 unsigned NumElts = VT.getVectorNumElements();
7384 unsigned NumUndefsLO = 0;
7385 unsigned NumUndefsHI = 0;
7386 unsigned Half = NumElts/2;
7388 // Count the number of UNDEF operands in the input build_vector.
7389 for (unsigned i = 0, e = Half; i != e; ++i)
7390 if (BV->getOperand(i)->isUndef())
7393 for (unsigned i = Half, e = NumElts; i != e; ++i)
7394 if (BV->getOperand(i)->isUndef())
7397 // Early exit if this is either a build_vector of all UNDEFs or all the
7398 // operands but one are UNDEF.
7399 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7403 SDValue InVec0, InVec1;
7404 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7405 // Try to match an SSE3 float HADD/HSUB.
7406 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7407 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7409 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7410 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7411 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7412 // Try to match an SSSE3 integer HADD/HSUB.
7413 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7414 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7416 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7417 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7420 if (!Subtarget.hasAVX())
7423 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7424 // Try to match an AVX horizontal add/sub of packed single/double
7425 // precision floating point values from 256-bit vectors.
7426 SDValue InVec2, InVec3;
7427 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7428 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7429 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7430 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7431 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7433 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7434 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7435 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7436 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7437 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7438 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7439 // Try to match an AVX2 horizontal add/sub of signed integers.
7440 SDValue InVec2, InVec3;
7442 bool CanFold = true;
7444 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7445 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7446 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7447 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7448 X86Opcode = X86ISD::HADD;
7449 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7450 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7451 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7452 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7453 X86Opcode = X86ISD::HSUB;
7458 // Fold this build_vector into a single horizontal add/sub.
7459 // Do this only if the target has AVX2.
7460 if (Subtarget.hasAVX2())
7461 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7463 // Do not try to expand this build_vector into a pair of horizontal
7464 // add/sub if we can emit a pair of scalar add/sub.
7465 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7468 // Convert this build_vector into a pair of horizontal binop followed by
7470 bool isUndefLO = NumUndefsLO == Half;
7471 bool isUndefHI = NumUndefsHI == Half;
7472 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7473 isUndefLO, isUndefHI);
7477 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7478 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7480 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7481 X86Opcode = X86ISD::HADD;
7482 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7483 X86Opcode = X86ISD::HSUB;
7484 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7485 X86Opcode = X86ISD::FHADD;
7486 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7487 X86Opcode = X86ISD::FHSUB;
7491 // Don't try to expand this build_vector into a pair of horizontal add/sub
7492 // if we can simply emit a pair of scalar add/sub.
7493 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7496 // Convert this build_vector into two horizontal add/sub followed by
7498 bool isUndefLO = NumUndefsLO == Half;
7499 bool isUndefHI = NumUndefsHI == Half;
7500 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7501 isUndefLO, isUndefHI);
7507 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7508 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7509 /// just apply the bit operation to the vectors.
7510 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7511 /// from this, but enough scalar bit operations are created from the later
7512 /// legalization + scalarization stages to need basic support.
7513 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7514 SelectionDAG &DAG) {
7516 MVT VT = Op->getSimpleValueType(0);
7517 unsigned NumElems = VT.getVectorNumElements();
7518 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7520 // Check that all elements have the same opcode.
7521 // TODO: Should we allow UNDEFS and if so how many?
7522 unsigned Opcode = Op->getOperand(0).getOpcode();
7523 for (unsigned i = 1; i < NumElems; ++i)
7524 if (Opcode != Op->getOperand(i).getOpcode())
7527 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7534 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7539 SmallVector<SDValue, 4> LHSElts, RHSElts;
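// Collect the scalar LHS and RHS operands so they can be rebuilt as two
// build_vectors and the bit operation applied once on whole vectors.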
7540 for (SDValue Elt : Op->ops()) {
7541 SDValue LHS = Elt.getOperand(0);
7542 SDValue RHS = Elt.getOperand(1);
7544 // We expect the canonicalized RHS operand to be the constant.
7545 if (!isa<ConstantSDNode>(RHS))
7547 LHSElts.push_back(LHS);
7548 RHSElts.push_back(RHS);
7551 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7552 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7553 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7556 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7557 /// functionality to do this, so it's all zeros, all ones, or some derivation
7558 /// that is cheap to calculate.
7559 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7560 const X86Subtarget &Subtarget) {
7562 MVT VT = Op.getSimpleValueType();
7564 // Vectors containing all zeros can be matched by pxor and xorps.
7565 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7566 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7567 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7568 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7571 return getZeroVector(VT, Subtarget, DAG, DL);
7574 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7575 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7576 // vpcmpeqd on 256-bit vectors.
7577 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7578 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7579 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7582 return getOnesVector(VT, DAG, DL);
7589 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7592 MVT VT = Op.getSimpleValueType();
7593 MVT ExtVT = VT.getVectorElementType();
7594 unsigned NumElems = Op.getNumOperands();
7596 // Generate vectors for predicate vectors.
7597 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7598 return LowerBUILD_VECTORvXi1(Op, DAG);
7600 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7601 return VectorConstant;
7603 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7604 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7606 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7607 return HorizontalOp;
7608 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7610 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7613 unsigned EVTBits = ExtVT.getSizeInBits();
7615 unsigned NumZero = 0;
7616 unsigned NumNonZero = 0;
7617 uint64_t NonZeros = 0;
7618 bool IsAllConstants = true;
7619 SmallSet<SDValue, 8> Values;
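// Scan the operands, counting the zero and non-zero elements and recording
// each non-zero lane in the NonZeros bit mask.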
7620 for (unsigned i = 0; i < NumElems; ++i) {
7621 SDValue Elt = Op.getOperand(i);
7625 if (Elt.getOpcode() != ISD::Constant &&
7626 Elt.getOpcode() != ISD::ConstantFP)
7627 IsAllConstants = false;
7628 if (X86::isZeroNode(Elt))
7631 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7632 NonZeros |= ((uint64_t)1 << i);
7637 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7638 if (NumNonZero == 0)
7639 return DAG.getUNDEF(VT);
7641 // Special case for single non-zero, non-undef, element.
7642 if (NumNonZero == 1) {
7643 unsigned Idx = countTrailingZeros(NonZeros);
7644 SDValue Item = Op.getOperand(Idx);
7646 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7647 // the value are obviously zero, truncate the value to i32 and do the
7648 // insertion that way. Only do this if the value is non-constant or if the
7649 // value is a constant being inserted into element 0. It is cheaper to do
7650 // a constant pool load than it is to do a movd + shuffle.
7651 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7652 (!IsAllConstants || Idx == 0)) {
7653 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7655 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7656 MVT VecVT = MVT::v4i32;
7658 // Truncate the value (which may itself be a constant) to i32, and
7659 // convert it to a vector with movd (S2V+shuffle to zero extend).
7660 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7661 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7662 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7663 Item, Idx * 2, true, Subtarget, DAG));
7667 // If we have a constant or non-constant insertion into the low element of
7668 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7669 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7670 // depending on what the source datatype is.
7673 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7675 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7676 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7677 assert((VT.is128BitVector() || VT.is256BitVector() ||
7678 VT.is512BitVector()) &&
7679 "Expected an SSE value type!");
7680 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7681 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7682 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7685 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
7687 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7688 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7689 if (VT.getSizeInBits() >= 256) {
7690 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7691 if (Subtarget.hasAVX()) {
7692 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7693 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7695 // Without AVX, we need to extend to a 128-bit vector and then
7696 // insert into the 256-bit vector.
7697 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7698 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7699 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7702 assert(VT.is128BitVector() && "Expected an SSE value type!");
7703 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7704 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7706 return DAG.getBitcast(VT, Item);
7710 // Is it a vector logical left shift?
7711 if (NumElems == 2 && Idx == 1 &&
7712 X86::isZeroNode(Op.getOperand(0)) &&
7713 !X86::isZeroNode(Op.getOperand(1))) {
7714 unsigned NumBits = VT.getSizeInBits();
7715 return getVShift(true, VT,
7716 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7717 VT, Op.getOperand(1)),
7718 NumBits/2, DAG, *this, dl);
7721 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7724 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7725 // is a non-constant being inserted into an element other than the low one,
7726 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7727 // movd/movss) to move this into the low element, then shuffle it into
7728 // place.
7729 if (EVTBits == 32) {
7730 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7731 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7735 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7736 if (Values.size() == 1) {
7737 if (EVTBits == 32) {
7738 // Instead of a shuffle like this:
7739 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7740 // Check if it's possible to issue this instead.
7741 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7742 unsigned Idx = countTrailingZeros(NonZeros);
7743 SDValue Item = Op.getOperand(Idx);
7744 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7745 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7750 // A vector full of immediates; various special cases are already
7751 // handled, so this is best done with a single constant-pool load.
7755 // See if we can use a vector load to get all of the elements.
7756 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7757 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7758 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7762 // For AVX-length vectors, build the individual 128-bit pieces and use
7763 // shuffles to put them in place.
7764 if (VT.is256BitVector() || VT.is512BitVector()) {
7765 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7767 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7769 // Build both the lower and upper subvector.
7771 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7772 SDValue Upper = DAG.getBuildVector(
7773 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7775 // Recreate the wider vector with the lower and upper part.
7776 if (VT.is256BitVector())
7777 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7778 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7781 // Let legalizer expand 2-wide build_vectors.
7782 if (EVTBits == 64) {
7783 if (NumNonZero == 1) {
7784 // One half is zero or undef.
7785 unsigned Idx = countTrailingZeros(NonZeros);
7786 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7787 Op.getOperand(Idx));
7788 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7793 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7794 if (EVTBits == 8 && NumElems == 16)
7795 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7799 if (EVTBits == 16 && NumElems == 8)
7800 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7804 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7805 if (EVTBits == 32 && NumElems == 4)
7806 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7809 // If element VT is == 32 bits, turn it into a number of shuffles.
7810 if (NumElems == 4 && NumZero > 0) {
7811 SmallVector<SDValue, 8> Ops(NumElems);
7812 for (unsigned i = 0; i < 4; ++i) {
7813 bool isZero = !(NonZeros & (1ULL << i));
7815 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7817 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7820 for (unsigned i = 0; i < 2; ++i) {
7821 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7824 Ops[i] = Ops[i*2]; // Must be a zero vector.
7827 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7830 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7833 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7838 bool Reverse1 = (NonZeros & 0x3) == 2;
7839 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7843 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7844 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7846 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7849 if (Values.size() > 1 && VT.is128BitVector()) {
7850 // Check for a build vector built mostly from a shuffle plus a few inserted elements.
7851 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7854 // For SSE 4.1, use insertps to insert each non-undef element in turn.
7855 if (Subtarget.hasSSE41()) {
7857 if (!Op.getOperand(0).isUndef())
7858 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7860 Result = DAG.getUNDEF(VT);
7862 for (unsigned i = 1; i < NumElems; ++i) {
7863 if (Op.getOperand(i).isUndef()) continue;
7864 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7865 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7870 // Otherwise, expand into a number of unpckl*, start by extending each of
7871 // our (non-undef) elements to the full vector width with the element in the
7872 // bottom slot of the vector (which generates no code for SSE).
7873 SmallVector<SDValue, 8> Ops(NumElems);
7874 for (unsigned i = 0; i < NumElems; ++i) {
7875 if (!Op.getOperand(i).isUndef())
7876 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7878 Ops[i] = DAG.getUNDEF(VT);
7881 // Next, we iteratively mix elements, e.g. for v4f32:
7882 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7883 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7884 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7885 unsigned EltStride = NumElems >> 1;
7886 while (EltStride != 0) {
7887 for (unsigned i = 0; i < EltStride; ++i) {
7888 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7889 // then it is safe to just drop this shuffle: V[i] is already in the
7890 // right place, the one element (since it's the first round) being
7891 // inserted as undef can be dropped. This isn't safe for successive
7892 // rounds because they will permute elements within both vectors.
7893 if (Ops[i+EltStride].isUndef() &&
7894 EltStride == NumElems/2)
7897 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7906 // 256-bit AVX can use the vinsertf128 instruction
7907 // to create 256-bit vectors from two other 128-bit ones.
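// For example, a v8f32 concatenation of two v4f32 operands is built by
// inserting the second operand into the upper 128 bits of the first,
// which matches a single vinsertf128.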
7908 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7910 MVT ResVT = Op.getSimpleValueType();
7912 assert((ResVT.is256BitVector() ||
7913 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7915 SDValue V1 = Op.getOperand(0);
7916 SDValue V2 = Op.getOperand(1);
7917 unsigned NumElems = ResVT.getVectorNumElements();
7918 if (ResVT.is256BitVector())
7919 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7921 if (Op.getNumOperands() == 4) {
7922 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7923 ResVT.getVectorNumElements()/2);
7924 SDValue V3 = Op.getOperand(2);
7925 SDValue V4 = Op.getOperand(3);
7926 return concat256BitVectors(
7927 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7928 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7931 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7934 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7935 const X86Subtarget &Subtarget,
7936 SelectionDAG & DAG) {
7938 MVT ResVT = Op.getSimpleValueType();
7939 unsigned NumOfOperands = Op.getNumOperands();
7941 assert(isPowerOf2_32(NumOfOperands) &&
7942 "Unexpected number of operands in CONCAT_VECTORS");
7944 SDValue Undef = DAG.getUNDEF(ResVT);
7945 if (NumOfOperands > 2) {
7946 // Specialize the cases when all, or all but one, of the operands are undef.
7947 unsigned NumOfDefinedOps = 0;
7949 for (unsigned i = 0; i < NumOfOperands; i++)
7950 if (!Op.getOperand(i).isUndef()) {
7954 if (NumOfDefinedOps == 0)
7956 if (NumOfDefinedOps == 1) {
7957 unsigned SubVecNumElts =
7958 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7959 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7960 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7961 Op.getOperand(OpIdx), IdxVal);
7964 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7965 ResVT.getVectorNumElements()/2);
7966 SmallVector<SDValue, 2> Ops;
7967 for (unsigned i = 0; i < NumOfOperands/2; i++)
7968 Ops.push_back(Op.getOperand(i));
7969 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7971 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7972 Ops.push_back(Op.getOperand(i));
7973 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7974 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7978 SDValue V1 = Op.getOperand(0);
7979 SDValue V2 = Op.getOperand(1);
7980 unsigned NumElems = ResVT.getVectorNumElements();
7981 assert(V1.getValueType() == V2.getValueType() &&
7982 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7983 "Unexpected operands in CONCAT_VECTORS");
7985 if (ResVT.getSizeInBits() >= 16)
7986 return Op; // The operation is legal with KUNPCK
7988 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7989 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7990 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7991 if (IsZeroV1 && IsZeroV2)
7994 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7996 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7998 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8000 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8002 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8005 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8007 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8008 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8011 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8012 const X86Subtarget &Subtarget,
8013 SelectionDAG &DAG) {
8014 MVT VT = Op.getSimpleValueType();
8015 if (VT.getVectorElementType() == MVT::i1)
8016 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8018 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8019 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8020 Op.getNumOperands() == 4)));
8022 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8023 // from two other 128-bit ones.
8025 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8026 return LowerAVXCONCAT_VECTORS(Op, DAG);
8029 //===----------------------------------------------------------------------===//
8030 // Vector shuffle lowering
8032 // This is an experimental code path for lowering vector shuffles on x86. It is
8033 // designed to handle arbitrary vector shuffles and blends, gracefully
8034 // degrading performance as necessary. It works hard to recognize idiomatic
8035 // shuffles and lower them to optimal instruction patterns without leaving
8036 // a framework that allows reasonably efficient handling of all vector shuffle
8037 // permutations.
8038 //===----------------------------------------------------------------------===//
8040 /// \brief Tiny helper function to identify a no-op mask.
8042 /// This is a somewhat boring predicate function. It checks whether the mask
8043 /// array input, which is assumed to be a single-input shuffle mask of the kind
8044 /// used by the X86 shuffle instructions (not a fully general
8045 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8046 /// in-place shuffle are 'no-op's.
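/// For example, for a 4-element shuffle both <0, 1, 2, 3> and <-1, 1, -1, 3>
/// are no-op masks.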
8047 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8048 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8049 assert(Mask[i] >= -1 && "Out of bound mask element!");
8050 if (Mask[i] >= 0 && Mask[i] != i)
8056 /// \brief Test whether there are elements crossing 128-bit lanes in this
8057 /// shuffle mask.
8059 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8060 /// and we routinely test for these.
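/// For example, in a v8i32 shuffle (lane size of 4 elements) a mask entry of
/// 1 at position 4 pulls an element from the lower 128-bit lane into the
/// upper lane and is therefore lane-crossing.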
8061 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8062 int LaneSize = 128 / VT.getScalarSizeInBits();
8063 int Size = Mask.size();
8064 for (int i = 0; i < Size; ++i)
8065 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8070 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8072 /// This checks a shuffle mask to see if it is performing the same
8073 /// lane-relative shuffle in each sub-lane. This trivially implies
8074 /// that it is also not lane-crossing. It may however involve a blend from the
8075 /// same lane of a second vector.
8077 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8078 /// non-trivial to compute in the face of undef lanes. The representation is
8079 /// suitable for use with existing 128-bit shuffles as entries from the second
8080 /// vector have been remapped to [LaneSize, 2*LaneSize).
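/// For example, the v8i32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats the
/// per-lane pattern <0, 5, 2, 7>, with the second-vector entries remapped
/// into [LaneSize, 2*LaneSize).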
8081 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8083 SmallVectorImpl<int> &RepeatedMask) {
8084 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8085 RepeatedMask.assign(LaneSize, -1);
8086 int Size = Mask.size();
8087 for (int i = 0; i < Size; ++i) {
8088 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8091 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8092 // This entry crosses lanes, so there is no way to model this shuffle.
8095 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8096 // Adjust second vector indices to start at LaneSize instead of Size.
8097 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8098 : Mask[i] % LaneSize + LaneSize;
8099 if (RepeatedMask[i % LaneSize] < 0)
8100 // This is the first non-undef entry in this slot of a 128-bit lane.
8101 RepeatedMask[i % LaneSize] = LocalM;
8102 else if (RepeatedMask[i % LaneSize] != LocalM)
8103 // Found a mismatch with the repeated mask.
8109 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8111 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8112 SmallVectorImpl<int> &RepeatedMask) {
8113 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8116 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8118 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8119 SmallVectorImpl<int> &RepeatedMask) {
8120 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8123 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8124 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8125 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8127 SmallVectorImpl<int> &RepeatedMask) {
8128 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8129 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8130 int Size = Mask.size();
8131 for (int i = 0; i < Size; ++i) {
8132 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8133 if (Mask[i] == SM_SentinelUndef)
8135 if (Mask[i] == SM_SentinelZero) {
8136 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8138 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8141 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8142 // This entry crosses lanes, so there is no way to model this shuffle.
8145 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8146 // Adjust second vector indices to start at LaneSize instead of Size.
8148 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8149 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8150 // This is the first non-undef entry in this slot of a 128-bit lane.
8151 RepeatedMask[i % LaneSize] = LocalM;
8152 else if (RepeatedMask[i % LaneSize] != LocalM)
8153 // Found a mismatch with the repeated mask.
8159 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8160 /// arguments.
8162 /// This is a fast way to test a shuffle mask against a fixed pattern:
8164 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8166 /// It returns true if the mask is exactly as wide as the argument list, and
8167 /// each element of the mask is either -1 (signifying undef) or the value given
8168 /// in the argument.
8169 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8170 ArrayRef<int> ExpectedMask) {
8171 if (Mask.size() != ExpectedMask.size())
8174 int Size = Mask.size();
8176 // If the values are build vectors, we can look through them to find
8177 // equivalent inputs that make the shuffles equivalent.
8178 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8179 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8181 for (int i = 0; i < Size; ++i) {
8182 assert(Mask[i] >= -1 && "Out of bound mask element!");
8183 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8184 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8185 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8186 if (!MaskBV || !ExpectedBV ||
8187 MaskBV->getOperand(Mask[i] % Size) !=
8188 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8196 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8198 /// The masks must be exactly the same width.
8200 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8201 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8203 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8204 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8205 ArrayRef<int> ExpectedMask) {
8206 int Size = Mask.size();
8207 if (Size != (int)ExpectedMask.size())
8210 for (int i = 0; i < Size; ++i)
8211 if (Mask[i] == SM_SentinelUndef)
8213 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8215 else if (Mask[i] != ExpectedMask[i])
8221 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8222 // mask.
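// For example, the mask <0, 5, 2, 7> with Zeroable bits 1 and 3 set becomes
// <0, SM_SentinelZero, 2, SM_SentinelZero>.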
8223 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8224 const APInt &Zeroable) {
8225 int NumElts = Mask.size();
8226 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8228 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8229 for (int i = 0; i != NumElts; ++i) {
8231 if (M == SM_SentinelUndef)
8233 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8234 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8239 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8240 // instructions.
8241 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8242 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8245 SmallVector<int, 8> Unpcklwd;
8246 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8247 /* Unary = */ false);
8248 SmallVector<int, 8> Unpckhwd;
8249 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8250 /* Unary = */ false);
8251 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8252 isTargetShuffleEquivalent(Mask, Unpckhwd));
8253 return IsUnpackwdMask;
8256 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8258 /// This helper function produces an 8-bit shuffle immediate corresponding to
8259 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8260 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8261 /// example.
8263 /// NB: We rely heavily on "undef" masks preserving the input lane.
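/// For example, the mask <2, 1, 0, 3> produces the immediate
/// (2 << 0) | (1 << 2) | (0 << 4) | (3 << 6) == 0xC6.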
8264 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8265 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8266 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8267 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8268 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8269 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8272 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8273 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8274 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8275 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8279 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8280 SelectionDAG &DAG) {
8281 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8284 /// \brief Compute whether each element of a shuffle is zeroable.
8286 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8287 /// Either it is an undef element in the shuffle mask, the element of the input
8288 /// referenced is undef, or the element of the input referenced is known to be
8289 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8290 /// as many lanes with this technique as possible to simplify the remaining
8291 /// shuffle.
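/// For example, shuffling a vector against an all-zeros second operand with
/// the v4i32 mask <0, 4, 1, 5> makes elements 1 and 3 of the result zeroable.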
8292 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8293 SDValue V1, SDValue V2) {
8294 APInt Zeroable(Mask.size(), 0);
8295 V1 = peekThroughBitcasts(V1);
8296 V2 = peekThroughBitcasts(V2);
8298 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8299 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8301 int VectorSizeInBits = V1.getValueSizeInBits();
8302 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8303 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8305 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8307 // Handle the easy cases.
8308 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8313 // Determine shuffle input and normalize the mask.
8314 SDValue V = M < Size ? V1 : V2;
8317 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8318 if (V.getOpcode() != ISD::BUILD_VECTOR)
8321 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8322 // the (larger) source element must be UNDEF/ZERO.
8323 if ((Size % V.getNumOperands()) == 0) {
8324 int Scale = Size / V->getNumOperands();
8325 SDValue Op = V.getOperand(M / Scale);
8326 if (Op.isUndef() || X86::isZeroNode(Op))
8328 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8329 APInt Val = Cst->getAPIntValue();
8330 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8331 Val = Val.getLoBits(ScalarSizeInBits);
8334 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8335 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8336 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8337 Val = Val.getLoBits(ScalarSizeInBits);
8344 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8345 // elements must be UNDEF or ZERO.
8346 if ((V.getNumOperands() % Size) == 0) {
8347 int Scale = V->getNumOperands() / Size;
8348 bool AllZeroable = true;
8349 for (int j = 0; j < Scale; ++j) {
8350 SDValue Op = V.getOperand((M * Scale) + j);
8351 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8362 // The shuffle result is of the form:
8363 //   0* a[0] 0* a[1] ... 0* a[n], n >= 0,
8364 // where the a[] elements appear in ascending order and 0* denotes a run
8365 // of zero elements. Each element of Zeroable corresponds to a particular
8366 // Mask element, as described in computeZeroableShuffleElements.
8367 // The function looks for a sub-mask whose nonzero elements are in
8368 // increasing order; if such a sub-mask exists, the function returns true.
8369 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8370 ArrayRef<int> Mask, const EVT &VectorType,
8371 bool &IsZeroSideLeft) {
8372 int NextElement = -1;
8373 // Check if the Mask's nonzero elements are in increasing order.
8374 for (int i = 0, e = Mask.size(); i < e; i++) {
8375 // Check that the mask's zero elements are built only from zeros.
8376 assert(Mask[i] >= -1 && "Out of bound mask element!");
8381 // Find the lowest non-zero element.
8382 if (NextElement < 0) {
8383 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8384 IsZeroSideLeft = NextElement != 0;
8386 // Exit if the mask's non-zero elements are not in increasing order.
8387 if (NextElement != Mask[i])
8394 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8395 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8396 ArrayRef<int> Mask, SDValue V1,
8398 const APInt &Zeroable,
8399 const X86Subtarget &Subtarget,
8400 SelectionDAG &DAG) {
8401 int Size = Mask.size();
8402 int LaneSize = 128 / VT.getScalarSizeInBits();
8403 const int NumBytes = VT.getSizeInBits() / 8;
8404 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8406 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8407 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8408 (Subtarget.hasBWI() && VT.is512BitVector()));
8410 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8411 // Sign bit set in i8 mask means zero element.
8412 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8415 for (int i = 0; i < NumBytes; ++i) {
8416 int M = Mask[i / NumEltBytes];
8418 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8421 if (Zeroable[i / NumEltBytes]) {
8422 PSHUFBMask[i] = ZeroMask;
8426 // We can only use a single input of V1 or V2.
8427 SDValue SrcV = (M >= Size ? V2 : V1);
8433 // PSHUFB can't cross lanes, ensure this doesn't happen.
8434 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8438 M = M * NumEltBytes + (i % NumEltBytes);
8439 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8441 assert(V && "Failed to find a source input");
8443 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8444 return DAG.getBitcast(
8445 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8446 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8449 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8450 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8453 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
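// For example, an expand with write-mask 0b1010 (zeroing the unselected
// lanes) places the first two source elements into result lanes 1 and 3,
// giving <0, a, 0, b>.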
8454 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8455 const APInt &Zeroable,
8456 ArrayRef<int> Mask, SDValue &V1,
8457 SDValue &V2, SelectionDAG &DAG,
8458 const X86Subtarget &Subtarget) {
8459 bool IsLeftZeroSide = true;
8460 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8463 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8465 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8466 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8467 unsigned NumElts = VT.getVectorNumElements();
8468 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8469 "Unexpected number of vector elements");
8470 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8471 Subtarget, DAG, DL);
8472 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8473 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8474 return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
8475 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8479 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8480 unsigned &UnpackOpcode, bool IsUnary,
8481 ArrayRef<int> TargetMask, SDLoc &DL,
8483 const X86Subtarget &Subtarget) {
8484 int NumElts = VT.getVectorNumElements();
8486 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8487 for (int i = 0; i != NumElts; i += 2) {
8488 int M1 = TargetMask[i + 0];
8489 int M2 = TargetMask[i + 1];
8490 Undef1 &= (SM_SentinelUndef == M1);
8491 Undef2 &= (SM_SentinelUndef == M2);
8492 Zero1 &= isUndefOrZero(M1);
8493 Zero2 &= isUndefOrZero(M2);
8495 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8496 "Zeroable shuffle detected");
8498 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8499 SmallVector<int, 64> Unpckl, Unpckh;
8500 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8501 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8502 UnpackOpcode = X86ISD::UNPCKL;
8503 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8504 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8508 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8509 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8510 UnpackOpcode = X86ISD::UNPCKH;
8511 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8512 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8516 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8517 if (IsUnary && (Zero1 || Zero2)) {
8518 // Don't bother if we can blend instead.
8519 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8520 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8523 bool MatchLo = true, MatchHi = true;
8524 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8525 int M = TargetMask[i];
8527 // Ignore if the input is known to be zero or the index is undef.
8528 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8529 (M == SM_SentinelUndef))
8532 MatchLo &= (M == Unpckl[i]);
8533 MatchHi &= (M == Unpckh[i]);
8536 if (MatchLo || MatchHi) {
8537 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8538 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8539 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8544 // If a binary shuffle, commute and try again.
8546 ShuffleVectorSDNode::commuteMask(Unpckl);
8547 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8548 UnpackOpcode = X86ISD::UNPCKL;
8553 ShuffleVectorSDNode::commuteMask(Unpckh);
8554 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8555 UnpackOpcode = X86ISD::UNPCKH;
8564 // X86 has dedicated unpack instructions that can handle specific blend
8565 // operations: UNPCKH and UNPCKL.
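// For example, for v4i32 the (binary) unpack-lo mask is <0, 4, 1, 5> and the
// unpack-hi mask is <2, 6, 3, 7>.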
8566 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8567 ArrayRef<int> Mask, SDValue V1,
8568 SDValue V2, SelectionDAG &DAG) {
8569 SmallVector<int, 8> Unpckl;
8570 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8571 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8572 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8574 SmallVector<int, 8> Unpckh;
8575 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8576 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8577 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8579 // Commute and try again.
8580 ShuffleVectorSDNode::commuteMask(Unpckl);
8581 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8582 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8584 ShuffleVectorSDNode::commuteMask(Unpckh);
8585 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8586 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8591 /// \brief Try to emit a bitmask instruction for a shuffle.
8593 /// This handles cases where we can model a blend exactly as a bitmask due to
8594 /// one of the inputs being zeroable.
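/// For example, if V2 is known to be all zeros, the v4i32 mask <0, 5, 2, 7>
/// keeps V1 in lanes 0 and 2 and can be lowered as an AND of V1 with the
/// constant <-1, 0, -1, 0>.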
8595 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8596 SDValue V2, ArrayRef<int> Mask,
8597 const APInt &Zeroable,
8598 SelectionDAG &DAG) {
8599 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8600 MVT EltVT = VT.getVectorElementType();
8601 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8602 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8603 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8605 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8608 if (Mask[i] % Size != i)
8609 return SDValue(); // Not a blend.
8611 V = Mask[i] < Size ? V1 : V2;
8612 else if (V != (Mask[i] < Size ? V1 : V2))
8613 return SDValue(); // Can only let one input through the mask.
8615 VMaskOps[i] = AllOnes;
8618 return SDValue(); // No non-zeroable elements!
8620 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8621 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8624 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8626 /// This is used as a fallback approach when first class blend instructions are
8627 /// unavailable. Currently it is only suitable for integer vectors, but could
8628 /// be generalized for floating point vectors if desirable.
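/// For example, the v4i32 blend mask <0, 5, 2, 7> is lowered as
/// (V1 & <-1, 0, -1, 0>) | (V2 & ~<-1, 0, -1, 0>).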
8629 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8630 SDValue V2, ArrayRef<int> Mask,
8631 SelectionDAG &DAG) {
8632 assert(VT.isInteger() && "Only supports integer vector types!");
8633 MVT EltVT = VT.getVectorElementType();
8634 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8635 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8636 SmallVector<SDValue, 16> MaskOps;
8637 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8638 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8639 return SDValue(); // Shuffled input!
8640 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8643 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8644 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8645 // We have to bitcast V2 to the i64 mask type and back around the ANDNP.
8646 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8647 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8648 DAG.getBitcast(MaskVT, V1Mask),
8649 DAG.getBitcast(MaskVT, V2)));
8650 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8653 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8654 SDValue PreservedSrc,
8655 const X86Subtarget &Subtarget,
8658 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8659 MutableArrayRef<int> TargetMask,
8660 bool &ForceV1Zero, bool &ForceV2Zero,
8661 uint64_t &BlendMask) {
8662 bool V1IsZeroOrUndef =
8663 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8664 bool V2IsZeroOrUndef =
8665 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8668 ForceV1Zero = false, ForceV2Zero = false;
8669 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8671 // Attempt to generate the binary blend mask. If an input is zero then
8672 // we can use any lane.
8673 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8674 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8675 int M = TargetMask[i];
8676 if (M == SM_SentinelUndef)
8680 if (M == i + Size) {
8681 BlendMask |= 1ull << i;
8684 if (M == SM_SentinelZero) {
8685 if (V1IsZeroOrUndef) {
8690 if (V2IsZeroOrUndef) {
8692 BlendMask |= 1ull << i;
8693 TargetMask[i] = i + Size;
8702 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8703 uint64_t ScaledMask = 0;
8704 for (int i = 0; i != Size; ++i)
8705 if (BlendMask & (1ull << i))
8706 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
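// For example, scaling BlendMask 0b0101 (Size == 4) by Scale == 2 yields
// 0b00110011.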
8710 /// \brief Try to emit a blend instruction for a shuffle.
8712 /// This doesn't do any checks for the availability of instructions for blending
8713 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8714 /// be matched in the backend with the type given. What it does check for is
8715 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8716 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8717 SDValue V2, ArrayRef<int> Original,
8718 const APInt &Zeroable,
8719 const X86Subtarget &Subtarget,
8720 SelectionDAG &DAG) {
8721 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8723 uint64_t BlendMask = 0;
8724 bool ForceV1Zero = false, ForceV2Zero = false;
8725 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8729 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8731 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8733 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8735 switch (VT.SimpleTy) {
8740 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8741 DAG.getConstant(BlendMask, DL, MVT::i8));
8745 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8749 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8750 // that instruction.
8751 if (Subtarget.hasAVX2()) {
8752 // Scale the blend by the number of 32-bit dwords per element.
8753 int Scale = VT.getScalarSizeInBits() / 32;
8754 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8755 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8756 V1 = DAG.getBitcast(BlendVT, V1);
8757 V2 = DAG.getBitcast(BlendVT, V2);
8758 return DAG.getBitcast(
8759 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8760 DAG.getConstant(BlendMask, DL, MVT::i8)));
8764 // For integer shuffles we need to expand the mask and cast the inputs to
8765 // v8i16s prior to blending.
8766 int Scale = 8 / VT.getVectorNumElements();
8767 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8768 V1 = DAG.getBitcast(MVT::v8i16, V1);
8769 V2 = DAG.getBitcast(MVT::v8i16, V2);
8770 return DAG.getBitcast(VT,
8771 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8772 DAG.getConstant(BlendMask, DL, MVT::i8)));
8776 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8777 SmallVector<int, 8> RepeatedMask;
8778 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8779 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8780 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8782 for (int i = 0; i < 8; ++i)
8783 if (RepeatedMask[i] >= 8)
8784 BlendMask |= 1ull << i;
8785 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8786 DAG.getConstant(BlendMask, DL, MVT::i8));
8792 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8793 "256-bit byte-blends require AVX2 support!");
8795 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8797 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8798 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8799 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8802 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8803 if (SDValue Masked =
8804 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8807 // Scale the blend by the number of bytes per element.
8808 int Scale = VT.getScalarSizeInBits() / 8;
8810 // This form of blend is always done on bytes. Compute the byte vector
8811 // type.
8812 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8814 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8815 // mix of LLVM's code generator and the x86 backend. We tell the code
8816 // generator that boolean values in the elements of an x86 vector register
8817 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8818 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8819 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8820 // of the element (the remaining are ignored) and 0 in that high bit would
8821 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8822 // the LLVM model for boolean values in vector elements gets the relevant
8823 // bit set, it is set backwards and over constrained relative to x86's
8824 // actual model.
8825 SmallVector<SDValue, 32> VSELECTMask;
8826 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8827 for (int j = 0; j < Scale; ++j)
8828 VSELECTMask.push_back(
8829 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8830 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8833 V1 = DAG.getBitcast(BlendVT, V1);
8834 V2 = DAG.getBitcast(BlendVT, V2);
8835 return DAG.getBitcast(
8836 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8837 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8846 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8847 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8848 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8851 llvm_unreachable("Not a supported integer vector type!");
8855 /// \brief Try to lower as a blend of elements from two inputs followed by
8856 /// a single-input permutation.
8858 /// This matches the pattern where we can blend elements from two inputs and
8859 /// then reduce the shuffle to a single-input permutation.
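/// For example, the v4i32 mask <1, 4, 3, 6> can be lowered as a blend with
/// mask <4, 1, 6, 3> followed by the single-input permute <1, 0, 3, 2>.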
8860 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8861 SDValue V1, SDValue V2,
8863 SelectionDAG &DAG) {
8864 // We build up the blend mask while checking whether a blend is a viable way
8865 // to reduce the shuffle.
8866 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8867 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8869 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8873 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8875 if (BlendMask[Mask[i] % Size] < 0)
8876 BlendMask[Mask[i] % Size] = Mask[i];
8877 else if (BlendMask[Mask[i] % Size] != Mask[i])
8878 return SDValue(); // Can't blend in the needed input!
8880 PermuteMask[i] = Mask[i] % Size;
8883 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8884 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8887 /// \brief Generic routine to decompose a shuffle and blend into independent
8888 /// blends and permutes.
8890 /// This matches the extremely common pattern for handling combined
8891 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8892 /// operations. It will try to pick the best arrangement of shuffles and
8893 /// blends.
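/// For example, the v4i32 mask <2, 5, 0, 7> decomposes into the V1 shuffle
/// <2, -1, 0, -1>, the V2 shuffle <-1, 1, -1, 3>, and the blend mask
/// <0, 5, 2, 7>.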
8894 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8898 SelectionDAG &DAG) {
8899 // Shuffle the input elements into the desired positions in V1 and V2 and
8900 // blend them together.
8901 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8902 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8903 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8904 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8905 if (Mask[i] >= 0 && Mask[i] < Size) {
8906 V1Mask[i] = Mask[i];
8908 } else if (Mask[i] >= Size) {
8909 V2Mask[i] = Mask[i] - Size;
8910 BlendMask[i] = i + Size;
8913 // Try to lower with the simpler initial blend strategy unless one of the
8914 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8915 // shuffle may be able to fold with a load or other benefit. However, when
8916 // we'll have to do 2x as many shuffles in order to achieve this, blending
8917 // first is a better strategy.
8918 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8919 if (SDValue BlendPerm =
8920 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8923 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8924 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8925 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8928 /// \brief Try to lower a vector shuffle as a rotation.
8930 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8931 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8932 ArrayRef<int> Mask) {
8933 int NumElts = Mask.size();
8935 // We need to detect various ways of spelling a rotation:
8936 // [11, 12, 13, 14, 15, 0, 1, 2]
8937 // [-1, 12, 13, 14, -1, -1, 1, -1]
8938 // [-1, -1, -1, -1, -1, -1, 1, 2]
8939 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8940 // [-1, 4, 5, 6, -1, -1, 9, -1]
8941 // [-1, 4, 5, 6, -1, -1, -1, -1]
8944 for (int i = 0; i < NumElts; ++i) {
8946 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8947 "Unexpected mask index.");
8951 // Determine where a rotated vector would have started.
8952 int StartIdx = i - (M % NumElts);
8954 // The identity rotation isn't interesting, stop.
8957 // If we found the tail of a vector the rotation must be the missing
8958 // front. If we found the head of a vector, it must be how much of the
8959 // head is missing.
8960 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8963 Rotation = CandidateRotation;
8964 else if (Rotation != CandidateRotation)
8965 // The rotations don't match, so we can't match this mask.
8968 // Compute which value this mask is pointing at.
8969 SDValue MaskV = M < NumElts ? V1 : V2;
8971 // Compute which of the two target values this index should be assigned
8972 // to. This reflects whether the high elements are remaining or the low
8973 // elements are remaining.
8974 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8976 // Either set up this value if we've not encountered it before, or check
8977 // that it remains consistent.
8980 else if (TargetV != MaskV)
8981 // This may be a rotation, but it pulls from the inputs in some
8982 // unsupported interleaving.
8986 // Check that we successfully analyzed the mask, and normalize the results.
8987 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8988 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9000 /// \brief Try to lower a vector shuffle as a byte rotation.
9002 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9003 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9004 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9005 /// try to generically lower a vector shuffle through such a pattern. It
9006 /// does not check for the profitability of lowering either as PALIGNR or
9007 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9008 /// This matches shuffle vectors that look like:
9010 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9012 /// Essentially it concatenates V1 and V2, shifts right by some number of
9013 /// elements, and takes the low elements as the result. Note that while this is
9014 /// specified as a *right shift* because x86 is little-endian, it is a *left
9015 /// rotate* of the vector lanes.
9016 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9017 ArrayRef<int> Mask) {
9018 // Don't accept any shuffles with zero elements.
9019 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9022 // PALIGNR works on 128-bit lanes.
9023 SmallVector<int, 16> RepeatedMask;
9024 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9027 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9031 // PALIGNR rotates bytes, so we need to scale the
9032 // rotation based on how many bytes are in the vector lane.
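// For example, for v8i16 each lane element is 2 bytes, so an element
// rotation of 3 becomes a byte rotation of 6.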
9033 int NumElts = RepeatedMask.size();
9034 int Scale = 16 / NumElts;
9035 return Rotation * Scale;
9038 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9039 SDValue V1, SDValue V2,
9041 const X86Subtarget &Subtarget,
9042 SelectionDAG &DAG) {
9043 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9045 SDValue Lo = V1, Hi = V2;
9046 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9047 if (ByteRotation <= 0)
9050 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9051 // PSLLDQ/PSRLDQ.
9052 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9053 Lo = DAG.getBitcast(ByteVT, Lo);
9054 Hi = DAG.getBitcast(ByteVT, Hi);
9056 // SSSE3 targets can use the palignr instruction.
9057 if (Subtarget.hasSSSE3()) {
9058 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9059 "512-bit PALIGNR requires BWI instructions");
9060 return DAG.getBitcast(
9061 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9062 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9065 assert(VT.is128BitVector() &&
9066 "Rotate-based lowering only supports 128-bit lowering!");
9067 assert(Mask.size() <= 16 &&
9068 "Can shuffle at most 16 bytes in a 128-bit vector!");
9069 assert(ByteVT == MVT::v16i8 &&
9070 "SSE2 rotate lowering only needed for v16i8!");
9072 // Default SSE2 implementation
9073 int LoByteShift = 16 - ByteRotation;
9074 int HiByteShift = ByteRotation;
9076 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9077 DAG.getConstant(LoByteShift, DL, MVT::i8));
9078 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9079 DAG.getConstant(HiByteShift, DL, MVT::i8));
9080 return DAG.getBitcast(VT,
9081 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9084 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9086 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9087 /// rotation of the concatenation of two vectors; this routine will
9088 /// try to generically lower a vector shuffle through such a pattern.
9090 /// Essentially it concatenates V1 and V2, shifts right by some number of
9091 /// elements, and takes the low elements as the result. Note that while this is
9092 /// specified as a *right shift* because x86 is little-endian, it is a *left
9093 /// rotate* of the vector lanes.
9094 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9095 SDValue V1, SDValue V2,
9097 const X86Subtarget &Subtarget,
9098 SelectionDAG &DAG) {
9099 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9100 "Only 32-bit and 64-bit elements are supported!");
9102 // 128/256-bit vectors are only supported with VLX.
9103 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9104 && "VLX required for 128/256-bit vectors");
9106 SDValue Lo = V1, Hi = V2;
9107 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9111 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9112 DAG.getConstant(Rotation, DL, MVT::i8));
9115 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9117 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9118 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9119 /// matches elements from one of the input vectors shuffled to the left or
9120 /// right with zeroable elements 'shifted in'. It handles both the strictly
9121 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9122 /// quad word lane.
9124 /// PSLL : (little-endian) left bit shift.
9125 /// [ zz, 0, zz, 2 ]
9126 /// [ -1, 4, zz, -1 ]
9127 /// PSRL : (little-endian) right bit shift.
9129 /// [ -1, -1, 7, zz]
9130 /// PSLLDQ : (little-endian) left byte shift
9131 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9132 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9133 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9134 /// PSRLDQ : (little-endian) right byte shift
9135 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9136 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9137 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
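/// For example, the v4i32 mask [ zz, 0, zz, 2 ] above matches X86ISD::VSHLI
/// of the vector reinterpreted as v2i64 with a shift amount of 32 bits.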
9138 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9139 unsigned ScalarSizeInBits,
9140 ArrayRef<int> Mask, int MaskOffset,
9141 const APInt &Zeroable,
9142 const X86Subtarget &Subtarget) {
9143 int Size = Mask.size();
9144 unsigned SizeInBits = Size * ScalarSizeInBits;
9146 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9147 for (int i = 0; i < Size; i += Scale)
9148 for (int j = 0; j < Shift; ++j)
9149 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9155 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9156 for (int i = 0; i != Size; i += Scale) {
9157 unsigned Pos = Left ? i + Shift : i;
9158 unsigned Low = Left ? i : i + Shift;
9159 unsigned Len = Scale - Shift;
9160 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9164 int ShiftEltBits = ScalarSizeInBits * Scale;
9165 bool ByteShift = ShiftEltBits > 64;
9166 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9167 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9168 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9170 // Normalize the scale for byte shifts to still produce an i64 element
9171 // type.
9172 Scale = ByteShift ? Scale / 2 : Scale;
9174 // We need to round trip through the appropriate type for the shift.
9175 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9176 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9177 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9178 return (int)ShiftAmt;
9181 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9182 // keep doubling the size of the integer elements up to that. We can
9183 // then shift the elements of the integer vector by whole multiples of
9184 // their width within the elements of the larger integer vector. Test each
9185 // multiple to see if we can find a match with the moved element indices
9186 // and that the shifted in elements are all zeroable.
9187 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9188 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9189 for (int Shift = 1; Shift != Scale; ++Shift)
9190 for (bool Left : {true, false})
9191 if (CheckZeros(Shift, Scale, Left)) {
9192 int ShiftAmt = MatchShift(Shift, Scale, Left);
9201 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9202 SDValue V2, ArrayRef<int> Mask,
9203 const APInt &Zeroable,
9204 const X86Subtarget &Subtarget,
9205 SelectionDAG &DAG) {
9206 int Size = Mask.size();
9207 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9213 // Try to match shuffle against V1 shift.
9214 int ShiftAmt = matchVectorShuffleAsShift(
9215 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9217 // If V1 failed, try to match shuffle against V2 shift.
9220 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9221 Mask, Size, Zeroable, Subtarget);
9228 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9229 "Illegal integer vector type");
9230 V = DAG.getBitcast(ShiftVT, V);
9231 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9232 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9233 return DAG.getBitcast(VT, V);
9236 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9237 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9238 SDValue V2, ArrayRef<int> Mask,
9239 const APInt &Zeroable,
9240 SelectionDAG &DAG) {
9241 int Size = Mask.size();
9242 int HalfSize = Size / 2;
9243 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9244 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9246 // Upper half must be undefined.
9247 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9250 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9251 // Remainder of lower half result is zero and upper half is all undef.
9252 auto LowerAsEXTRQ = [&]() {
9253 // Determine the extraction length from the part of the
9254 // lower half that isn't zeroable.
9256 for (; Len > 0; --Len)
9257 if (!Zeroable[Len - 1])
9259 assert(Len > 0 && "Zeroable shuffle mask");
9261 // Attempt to match first Len sequential elements from the lower half.
9264 for (int i = 0; i != Len; ++i) {
9268 SDValue &V = (M < Size ? V1 : V2);
9271 // The extracted elements must start at a valid index and all mask
9272 // elements must be in the lower half.
9273 if (i > M || M >= HalfSize)
9276 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9287 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9288 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9289 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9290 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
9291 DAG.getConstant(BitLen, DL, MVT::i8),
9292 DAG.getConstant(BitIdx, DL, MVT::i8));
9295 if (SDValue ExtrQ = LowerAsEXTRQ())
9298 // INSERTQ: Extract lowest Len elements from lower half of second source and
9299 // insert over first source, starting at Idx.
9300 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9301 auto LowerAsInsertQ = [&]() {
9302 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9305 // Attempt to match first source from mask before insertion point.
9306 if (isUndefInRange(Mask, 0, Idx)) {
9308 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9310 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9316 // Extend the extraction length looking to match both the insertion of
9317 // the second source and the remaining elements of the first.
9318 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9323 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9325 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9331 // Match the remaining elements of the lower half.
9332 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9334 } else if ((!Base || (Base == V1)) &&
9335 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9337 } else if ((!Base || (Base == V2)) &&
9338 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9345 // We may not have a base (first source) - this can safely be undefined.
9347 Base = DAG.getUNDEF(VT);
9349 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9350 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9351 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
9352 DAG.getConstant(BitLen, DL, MVT::i8),
9353 DAG.getConstant(BitIdx, DL, MVT::i8));
9360 if (SDValue InsertQ = LowerAsInsertQ())
9366 /// \brief Lower a vector shuffle as a zero or any extension.
9368 /// Given a specific number of elements, element bit width, and extension
9369 /// stride, produce either a zero or any extension based on the available
9370 /// features of the subtarget. The extended elements are consecutive and
9371 /// can start from an offset element index in the input; to avoid excess
9372 /// shuffling, the offset must either be in the bottom lane or at the start
9373 /// of a higher lane. All extended elements must be from a single input
9374 /// vector.
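/// For example, a v8i16 shuffle whose mask is <0, zz, 1, zz, 2, zz, 3, zz>
/// corresponds to a zero extension of the low four elements, i.e. Scale == 2
/// and Offset == 0.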
9375 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9376 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9377 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9378 assert(Scale > 1 && "Need a scale to extend.");
9379 int EltBits = VT.getScalarSizeInBits();
9380 int NumElements = VT.getVectorNumElements();
9381 int NumEltsPerLane = 128 / EltBits;
9382 int OffsetLane = Offset / NumEltsPerLane;
9383 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9384 "Only 8, 16, and 32 bit elements can be extended.");
9385 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9386 assert(0 <= Offset && "Extension offset must be positive.");
9387 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9388 "Extension offset must be in the first lane or start an upper lane.");
9390 // Check that an index is in the same lane as the base offset.
9391 auto SafeOffset = [&](int Idx) {
9392 return OffsetLane == (Idx / NumEltsPerLane);
9395 // Shift along an input so that the offset base moves to the first element.
9396 auto ShuffleOffset = [&](SDValue V) {
9400 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9401 for (int i = 0; i * Scale < NumElements; ++i) {
9402 int SrcIdx = i + Offset;
9403 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9405 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9408 // Found a valid zext mask! Try various lowering strategies based on the
9409 // input type and available ISA extensions.
9410 if (Subtarget.hasSSE41()) {
9411 // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
9412 // PUNPCK will catch this in a later shuffle match.
9413 if (Offset && Scale == 2 && VT.is128BitVector())
9415 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9416 NumElements / Scale);
9417 InputV = ShuffleOffset(InputV);
9418 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9419 return DAG.getBitcast(VT, InputV);
9422 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9424 // For any-extends we can cheat for larger element sizes and use shuffle
9425 // instructions that can fold with a load and/or copy.
9426 if (AnyExt && EltBits == 32) {
9427 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9429 return DAG.getBitcast(
9430 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9431 DAG.getBitcast(MVT::v4i32, InputV),
9432 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9434 if (AnyExt && EltBits == 16 && Scale > 2) {
9435 int PSHUFDMask[4] = {Offset / 2, -1,
9436 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9437 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9438 DAG.getBitcast(MVT::v4i32, InputV),
9439 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440 int PSHUFWMask[4] = {1, -1, -1, -1};
9441 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9442 return DAG.getBitcast(
9443 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9444 DAG.getBitcast(MVT::v8i16, InputV),
9445 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9448 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9450 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9451 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9452 assert(VT.is128BitVector() && "Unexpected vector width!");
9454 int LoIdx = Offset * EltBits;
9455 SDValue Lo = DAG.getBitcast(
9456 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9457 DAG.getConstant(EltBits, DL, MVT::i8),
9458 DAG.getConstant(LoIdx, DL, MVT::i8)));
9460 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9461 !SafeOffset(Offset + 1))
9462 return DAG.getBitcast(VT, Lo);
9464 int HiIdx = (Offset + 1) * EltBits;
9465 SDValue Hi = DAG.getBitcast(
9466 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9467 DAG.getConstant(EltBits, DL, MVT::i8),
9468 DAG.getConstant(HiIdx, DL, MVT::i8)));
9469 return DAG.getBitcast(VT,
9470 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9473 // If this would require more than 2 unpack instructions to expand, use
9474 // pshufb when available. We can only use more than 2 unpack instructions
9475 // when zero extending i8 elements which also makes it easier to use pshufb.
9476 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9477 assert(NumElements == 16 && "Unexpected byte vector width!");
9478 SDValue PSHUFBMask[16];
9479 for (int i = 0; i < 16; ++i) {
9480 int Idx = Offset + (i / Scale);
9481 PSHUFBMask[i] = DAG.getConstant(
9482 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9484 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9485 return DAG.getBitcast(
9486 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9487 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
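// For example, with Scale == 8 and Offset == 0 the PSHUFB mask built above is
// { 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 1, 0x80, ... }: bytes 0 and 1
// land at the base of each i64 lane while 0x80 zeroes every other byte.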
9490 // If we are extending from an offset, ensure we start on a boundary that
9491 // we can unpack from.
9492 int AlignToUnpack = Offset % (NumElements / Scale);
9493 if (AlignToUnpack) {
9494 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9495 for (int i = AlignToUnpack; i < NumElements; ++i)
9496 ShMask[i - AlignToUnpack] = i;
9497 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9498 Offset -= AlignToUnpack;
9501 // Otherwise emit a sequence of unpacks.
9503 unsigned UnpackLoHi = X86ISD::UNPCKL;
9504 if (Offset >= (NumElements / 2)) {
9505 UnpackLoHi = X86ISD::UNPCKH;
9506 Offset -= (NumElements / 2);
9509 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9510 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9511 : getZeroVector(InputVT, Subtarget, DAG, DL);
9512 InputV = DAG.getBitcast(InputVT, InputV);
9513 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9517 } while (Scale > 1);
9518 return DAG.getBitcast(VT, InputV);
9521 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9523 /// This routine will try to do everything in its power to cleverly lower
9524 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9525 /// check for the profitability of this lowering; it tries to aggressively
9526 /// match this pattern. It will use all of the micro-architectural details it
9527 /// can to emit an efficient lowering. It handles both blends with all-zero
9528 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9529 /// masking out later).
9531 /// The reason we have dedicated lowering for zext-style shuffles is that they
9532 /// are both incredibly common and often quite performance sensitive.
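/// For example, a v4i32 shuffle with mask <0, 4, 1, 5> where V2 is known to be
/// all zeros is really a zero extension of the two low i32 elements of V1 to
/// i64, and is forwarded to lowerVectorShuffleAsSpecificZeroOrAnyExtend with
/// Scale == 2 (an illustrative sketch of the matching, not an exhaustive list).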
9533 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9534 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9535 const APInt &Zeroable, const X86Subtarget &Subtarget,
9536 SelectionDAG &DAG) {
9537 int Bits = VT.getSizeInBits();
9538 int NumLanes = Bits / 128;
9539 int NumElements = VT.getVectorNumElements();
9540 int NumEltsPerLane = NumElements / NumLanes;
9541 assert(VT.getScalarSizeInBits() <= 32 &&
9542 "Exceeds 32-bit integer zero extension limit");
9543 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9545 // Define a helper function to check a particular ext-scale and lower to it if valid.
9547 auto Lower = [&](int Scale) -> SDValue {
9552 for (int i = 0; i < NumElements; ++i) {
9555 continue; // Valid anywhere but doesn't tell us anything.
9556 if (i % Scale != 0) {
9557 // Each of the extended elements need to be zeroable.
9561 // We no longer are in the anyext case.
9566 // Each of the base elements needs to be consecutive indices into the
9567 // same input vector.
9568 SDValue V = M < NumElements ? V1 : V2;
9569 M = M % NumElements;
9572 Offset = M - (i / Scale);
9573 } else if (InputV != V)
9574 return SDValue(); // Flip-flopping inputs.
9576 // Offset must start in the lowest 128-bit lane or at the start of an upper lane.
9578 // FIXME: Is it ever worth allowing a negative base offset?
9579 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9580 (Offset % NumEltsPerLane) == 0))
9583 // If we are offsetting, all referenced entries must come from the same 128-bit lane.
9585 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9588 if ((M % NumElements) != (Offset + (i / Scale)))
9589 return SDValue(); // Non-consecutive strided elements.
9593 // If we fail to find an input, we have a zero-shuffle which should always
9594 // have already been handled.
9595 // FIXME: Maybe handle this here in case during blending we end up with one?
9599 // If we are offsetting, don't extend if we only match a single input; we
9600 // can always do better by using a basic PSHUF or PUNPCK.
9601 if (Offset != 0 && Matches < 2)
9604 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9605 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9608 // The widest scale possible for extending is to a 64-bit integer.
9609 assert(Bits % 64 == 0 &&
9610 "The number of bits in a vector must be divisible by 64 on x86!");
9611 int NumExtElements = Bits / 64;
9613 // Each iteration, try extending the elements half as much, but into twice as many elements.
9615 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9616 assert(NumElements % NumExtElements == 0 &&
9617 "The input vector size must be divisible by the extended size.");
9618 if (SDValue V = Lower(NumElements / NumExtElements))
9622 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9626 // Returns one of the source operands if the shuffle can be reduced to a
9627 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
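// For example, a v4i32 shuffle with mask <0, 1, Z, Z> (upper half zeroable)
// copies the low 64 bits of V1 and zeroes the rest, which a single
// VZEXT_MOVL (MOVQ) does directly.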
9628 auto CanZExtLowHalf = [&]() {
9629 for (int i = NumElements / 2; i != NumElements; ++i)
9632 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9634 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9639 if (SDValue V = CanZExtLowHalf()) {
9640 V = DAG.getBitcast(MVT::v2i64, V);
9641 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9642 return DAG.getBitcast(VT, V);
9645 // No viable ext lowering found.
9649 /// \brief Try to get a scalar value for a specific element of a vector.
9651 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9652 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9653 SelectionDAG &DAG) {
9654 MVT VT = V.getSimpleValueType();
9655 MVT EltVT = VT.getVectorElementType();
9656 V = peekThroughBitcasts(V);
9658 // If the bitcasts shift the element size, we can't extract an equivalent element.
9660 MVT NewVT = V.getSimpleValueType();
9661 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9664 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9665 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9666 // Ensure the scalar operand is the same size as the destination.
9667 // FIXME: Add support for scalar truncation where possible.
9668 SDValue S = V.getOperand(Idx);
9669 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9670 return DAG.getBitcast(EltVT, S);
9676 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9678 /// This is particularly important because the set of instructions varies
9679 /// significantly based on whether the operand is a load or not.
9680 static bool isShuffleFoldableLoad(SDValue V) {
9681 V = peekThroughBitcasts(V);
9682 return ISD::isNON_EXTLoad(V.getNode());
9685 /// \brief Try to lower insertion of a single element into a zero vector.
9687 /// This is a common pattern for which we have especially efficient lowerings
9688 /// across all subtarget feature sets.
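/// For example, a v4i32 shuffle with mask <4, Z, Z, Z> where V1 is zeroable
/// takes only the low element of V2 and zeroes everything else; this can be
/// emitted as a VZEXT_MOVL of V2 (an illustrative case only).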
9689 static SDValue lowerVectorShuffleAsElementInsertion(
9690 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9691 const APInt &Zeroable, const X86Subtarget &Subtarget,
9692 SelectionDAG &DAG) {
9694 MVT EltVT = VT.getVectorElementType();
9697 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9699 bool IsV1Zeroable = true;
9700 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9701 if (i != V2Index && !Zeroable[i]) {
9702 IsV1Zeroable = false;
9706 // Check for a single input from a SCALAR_TO_VECTOR node.
9707 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9708 // all the smarts here sunk into that routine. However, the current
9709 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9710 // vector shuffle lowering is dead.
9711 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9713 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9714 // We need to zext the scalar if it is smaller than an i32.
9715 V2S = DAG.getBitcast(EltVT, V2S);
9716 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9717 // Using zext to expand a narrow element won't work for non-zero elements.
9722 // Zero-extend directly to i32.
9724 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9726 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9727 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9728 EltVT == MVT::i16) {
9729 // Either not inserting from the low element of the input or the input
9730 // element size is too small to use VZEXT_MOVL to clear the high bits.
9734 if (!IsV1Zeroable) {
9735 // If V1 can't be treated as a zero vector we have fewer options to lower
9736 // this. We can't support integer vectors or non-zero targets cheaply, and
9737 // the V1 elements can't be permuted in any way.
9738 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9739 if (!VT.isFloatingPoint() || V2Index != 0)
9741 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9742 V1Mask[V2Index] = -1;
9743 if (!isNoopShuffleMask(V1Mask))
9745 // This is essentially a special case blend operation, but if we have
9746 // general purpose blend operations, they are always faster. Bail and let
9747 // the rest of the lowering handle these as blends.
9748 if (Subtarget.hasSSE41())
9751 // Otherwise, use MOVSD or MOVSS.
9752 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9753 "Only two types of floating point element types to handle!");
9754 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9758 // This lowering only works for the low element with floating point vectors.
9759 if (VT.isFloatingPoint() && V2Index != 0)
9762 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9764 V2 = DAG.getBitcast(VT, V2);
9767 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9768 // the desired position. Otherwise it is more efficient to do a vector
9769 // shift left. We know that we can do a vector shift left because all
9770 // the inputs are zero.
9771 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9772 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9773 V2Shuffle[V2Index] = 0;
9774 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9776 V2 = DAG.getBitcast(MVT::v16i8, V2);
9778 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9779 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9780 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9781 DAG.getDataLayout(), VT)));
9782 V2 = DAG.getBitcast(VT, V2);
9788 /// Try to lower broadcast of a single - truncated - integer element,
9789 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9791 /// This assumes we have AVX2.
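/// For example, a v8i16 broadcast of element 3 where \p V0 is a v4i32
/// BUILD_VECTOR: the i32 scalar at index 1 is shifted right by 16, truncated
/// to i16 and then broadcast with VBROADCAST (a rough sketch of the common
/// case handled below).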
9792 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9793 SDValue V0, int BroadcastIdx,
9794 const X86Subtarget &Subtarget,
9795 SelectionDAG &DAG) {
9796 assert(Subtarget.hasAVX2() &&
9797 "We can only lower integer broadcasts with AVX2!");
9799 EVT EltVT = VT.getVectorElementType();
9800 EVT V0VT = V0.getValueType();
9802 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9803 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9805 EVT V0EltVT = V0VT.getVectorElementType();
9806 if (!V0EltVT.isInteger())
9809 const unsigned EltSize = EltVT.getSizeInBits();
9810 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9812 // This is only a truncation if the original element type is larger.
9813 if (V0EltSize <= EltSize)
9816 assert(((V0EltSize % EltSize) == 0) &&
9817 "Scalar type sizes must all be powers of 2 on x86!");
9819 const unsigned V0Opc = V0.getOpcode();
9820 const unsigned Scale = V0EltSize / EltSize;
9821 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9823 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9824 V0Opc != ISD::BUILD_VECTOR)
9827 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9829 // If we're extracting non-least-significant bits, shift so we can truncate.
9830 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9831 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9832 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9833 if (const int OffsetIdx = BroadcastIdx % Scale)
9834 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9835 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9837 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9838 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9841 /// \brief Try to lower broadcast of a single element.
9843 /// For convenience, this code also bundles all of the subtarget feature set
9844 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9845 /// a convenient way to factor it out.
9846 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
9847 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9848 SDValue V1, SDValue V2,
9850 const X86Subtarget &Subtarget,
9851 SelectionDAG &DAG) {
9852 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9853 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9854 (Subtarget.hasAVX2() && VT.isInteger())))
9857 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9858 // we can only broadcast from a register with AVX2.
9859 unsigned NumElts = Mask.size();
9860 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9861 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9863 // Check that the mask is a broadcast.
9864 int BroadcastIdx = -1;
9865 for (int i = 0; i != (int)NumElts; ++i) {
9866 SmallVector<int, 8> BroadcastMask(NumElts, i);
9867 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9873 if (BroadcastIdx < 0)
9875 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9876 "a sorted mask where the broadcast "
9879 // Go up the chain of (vector) values to find a scalar load that we can
9880 // combine with the broadcast.
9883 switch (V.getOpcode()) {
9884 case ISD::BITCAST: {
9885 SDValue VSrc = V.getOperand(0);
9886 MVT SrcVT = VSrc.getSimpleValueType();
9887 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9892 case ISD::CONCAT_VECTORS: {
9893 int OperandSize = Mask.size() / V.getNumOperands();
9894 V = V.getOperand(BroadcastIdx / OperandSize);
9895 BroadcastIdx %= OperandSize;
9898 case ISD::INSERT_SUBVECTOR: {
9899 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9900 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9904 int BeginIdx = (int)ConstantIdx->getZExtValue();
9906 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9907 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9908 BroadcastIdx -= BeginIdx;
9919 // Check if this is a broadcast of a scalar. We special case lowering
9920 // for scalars so that we can more effectively fold with loads.
9921 // First, look through bitcast: if the original value has a larger element
9922 // type than the shuffle, the broadcast element is in essence truncated.
9923 // Make that explicit to ease folding.
9924 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9925 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9926 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9927 return TruncBroadcast;
9929 MVT BroadcastVT = VT;
9931 // Peek through any bitcast (only useful for loads).
9932 SDValue BC = peekThroughBitcasts(V);
9934 // Also check the simpler case, where we can directly reuse the scalar.
9935 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9936 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9937 V = V.getOperand(BroadcastIdx);
9939 // If we can't broadcast from a register, check that the input is a load.
9940 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9942 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9943 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9944 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9945 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9946 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9949 // If we are broadcasting a load that is only used by the shuffle
9950 // then we can reduce the vector load to the broadcasted scalar load.
9951 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9952 SDValue BaseAddr = Ld->getOperand(1);
9953 EVT SVT = BroadcastVT.getScalarType();
9954 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9955 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9956 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9957 DAG.getMachineFunction().getMachineMemOperand(
9958 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9960 // Make sure the newly-created LOAD is in the same position as Ld in
9961 // terms of dependency. We create a TokenFactor for Ld and V,
9962 // and update uses of Ld's output chain to use the TokenFactor.
9963 if (Ld->hasAnyUseOfValue(1)) {
9964 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9965 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9966 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9967 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9968 SDValue(V.getNode(), 1));
9970 } else if (!BroadcastFromReg) {
9971 // We can't broadcast from a vector register.
9973 } else if (BroadcastIdx != 0) {
9974 // We can only broadcast from the zero-element of a vector register,
9975 // but it can be advantageous to broadcast from the zero-element of a subvector.
9977 if (!VT.is256BitVector() && !VT.is512BitVector())
9980 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9981 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9984 // Only broadcast the zero-element of a 128-bit subvector.
9985 unsigned EltSize = VT.getScalarSizeInBits();
9986 if (((BroadcastIdx * EltSize) % 128) != 0)
9989 // The shuffle input might have been a bitcast we looked through; look at
9990 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
9991 // later bitcast it to BroadcastVT.
9992 MVT SrcVT = V.getSimpleValueType();
9993 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9994 "Unexpected vector element size");
9995 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
9996 "Unexpected vector size");
9998 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
9999 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10000 DAG.getIntPtrConstant(BroadcastIdx, DL));
10003 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10004 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10005 DAG.getBitcast(MVT::f64, V));
10007 // Bitcast back to the same scalar type as BroadcastVT.
10008 MVT SrcVT = V.getSimpleValueType();
10009 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10010 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10011 "Unexpected vector element size");
10012 if (SrcVT.isVector()) {
10013 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10014 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10016 SrcVT = BroadcastVT.getScalarType();
10018 V = DAG.getBitcast(SrcVT, V);
10021 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10022 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10023 V = DAG.getBitcast(MVT::f64, V);
10024 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10025 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10028 // We only support broadcasting from 128-bit vectors to minimize the
10029 // number of patterns we need to deal with in isel. So extract down to 128 bits.
10031 if (SrcVT.getSizeInBits() > 128)
10032 V = extract128BitVector(V, 0, DAG, DL);
10034 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10037 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10038 // INSERTPS when the V1 elements are already in the correct locations
10039 // because otherwise we can just always use two SHUFPS instructions which
10040 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10041 // perform INSERTPS if a single V1 element is out of place and all V2
10042 // elements are zeroable.
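// For example, a v4f32 shuffle with mask <0, 1, 2, 7> keeps V1 elements 0-2 in
// place and inserts V2[3] into lane 3, giving an INSERTPS immediate of
// (3 << 6) | (3 << 4) == 0xF0 (illustrative of the matching done below).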
10043 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10044 unsigned &InsertPSMask,
10045 const APInt &Zeroable,
10046 ArrayRef<int> Mask,
10047 SelectionDAG &DAG) {
10048 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10049 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10050 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10052 // Attempt to match INSERTPS with one element from VA or VB being
10053 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask are updated accordingly.
10055 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10056 ArrayRef<int> CandidateMask) {
10057 unsigned ZMask = 0;
10058 int VADstIndex = -1;
10059 int VBDstIndex = -1;
10060 bool VAUsedInPlace = false;
10062 for (int i = 0; i < 4; ++i) {
10063 // Synthesize a zero mask from the zeroable elements (includes undefs).
10069 // Flag if we use any VA inputs in place.
10070 if (i == CandidateMask[i]) {
10071 VAUsedInPlace = true;
10075 // We can only insert a single non-zeroable element.
10076 if (VADstIndex >= 0 || VBDstIndex >= 0)
10079 if (CandidateMask[i] < 4) {
10080 // VA input out of place for insertion.
10083 // VB input for insertion.
10088 // Don't bother if we have no (non-zeroable) element for insertion.
10089 if (VADstIndex < 0 && VBDstIndex < 0)
10092 // Determine element insertion src/dst indices. The src index is from the
10093 // start of the inserted vector, not the start of the concatenated vector.
10094 unsigned VBSrcIndex = 0;
10095 if (VADstIndex >= 0) {
10096 // If we have a VA input out of place, we use VA as the V2 element
10097 // insertion and don't use the original V2 at all.
10098 VBSrcIndex = CandidateMask[VADstIndex];
10099 VBDstIndex = VADstIndex;
10102 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10105 // If no V1 inputs are used in place, then the result is created only from
10106 // the zero mask and the V2 insertion - so remove V1 dependency.
10107 if (!VAUsedInPlace)
10108 VA = DAG.getUNDEF(MVT::v4f32);
10110 // Update V1, V2 and InsertPSMask accordingly.
10114 // Insert the V2 element into the desired position.
10115 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10120 if (matchAsInsertPS(V1, V2, Mask))
10123 // Commute and try again.
10124 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10125 ShuffleVectorSDNode::commuteMask(CommutedMask);
10126 if (matchAsInsertPS(V2, V1, CommutedMask))
10132 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10133 SDValue V2, ArrayRef<int> Mask,
10134 const APInt &Zeroable,
10135 SelectionDAG &DAG) {
10136 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10137 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10139 // Attempt to match the insertps pattern.
10140 unsigned InsertPSMask;
10141 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10144 // Insert the V2 element into the desired position.
10145 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10146 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10149 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10150 /// UNPCK instruction.
10152 /// This specifically targets cases where we end up alternating between
10153 /// the two inputs, and so can permute them into something that feeds a single
10154 /// UNPCK instruction. Note that this routine only targets integer vectors
10155 /// because for floating point vectors we have a generalized SHUFPS lowering
10156 /// strategy that handles everything that doesn't *exactly* match an unpack,
10157 /// making this clever lowering unnecessary.
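/// For example, a v4i32 shuffle with mask <0, 4, 2, 6> can be handled by first
/// permuting V1 to <0, 2, u, u> and V2 to <0, 2, u, u> and then issuing a
/// single PUNPCKLDQ of the two results (an illustrative case, not the only
/// pattern this matches).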
10158 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10159 SDValue V1, SDValue V2,
10160 ArrayRef<int> Mask,
10161 SelectionDAG &DAG) {
10162 assert(!VT.isFloatingPoint() &&
10163 "This routine only supports integer vectors.");
10164 assert(VT.is128BitVector() &&
10165 "This routine only works on 128-bit vectors.");
10166 assert(!V2.isUndef() &&
10167 "This routine should only be used when blending two inputs.");
10168 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10170 int Size = Mask.size();
10173 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10175 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10177 bool UnpackLo = NumLoInputs >= NumHiInputs;
10179 auto TryUnpack = [&](int ScalarSize, int Scale) {
10180 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10181 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10183 for (int i = 0; i < Size; ++i) {
10187 // Each element of the unpack contains Scale elements from this mask.
10188 int UnpackIdx = i / Scale;
10190 // We only handle the case where V1 feeds the first slots of the unpack.
10191 // We rely on canonicalization to ensure this is the case.
10192 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10195 // Setup the mask for this input. The indexing is tricky as we have to
10196 // handle the unpack stride.
10197 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10198 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10202 // If we will have to shuffle both inputs to use the unpack, check whether
10203 // we can just unpack first and shuffle the result. If so, skip this unpack.
10204 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10205 !isNoopShuffleMask(V2Mask))
10208 // Shuffle the inputs into place.
10209 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10210 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10212 // Cast the inputs to the type we will use to unpack them.
10213 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10214 V1 = DAG.getBitcast(UnpackVT, V1);
10215 V2 = DAG.getBitcast(UnpackVT, V2);
10217 // Unpack the inputs and cast the result back to the desired type.
10218 return DAG.getBitcast(
10219 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10220 UnpackVT, V1, V2));
10223 // We try each unpack from the largest to the smallest to try and find one
10224 // that fits this mask.
10225 int OrigScalarSize = VT.getScalarSizeInBits();
10226 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10227 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10230 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10232 if (NumLoInputs == 0 || NumHiInputs == 0) {
10233 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10234 "We have to have *some* inputs!");
10235 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10237 // FIXME: We could consider the total complexity of the permute of each
10238 // possible unpacking. Or at the least we should consider how many
10239 // half-crossings are created.
10240 // FIXME: We could consider commuting the unpacks.
10242 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10243 for (int i = 0; i < Size; ++i) {
10247 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10250 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10252 return DAG.getVectorShuffle(
10253 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10255 DAG.getUNDEF(VT), PermMask);
10261 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10263 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10264 /// support for floating point shuffles but not integer shuffles. These
10265 /// instructions will incur a domain crossing penalty on some chips though so
10266 /// it is better to avoid lowering through this for integer vectors where possible.
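/// For example, the two-input mask <0, 3> selects element 0 of V1 and element
/// 1 of V2 and can be emitted as SHUFPD V1, V2 with immediate 0b10; this is
/// the final fallback below, with cheaper forms (insertion, blend, unpack)
/// tried first (illustrative only).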
10268 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10269 const APInt &Zeroable,
10270 SDValue V1, SDValue V2,
10271 const X86Subtarget &Subtarget,
10272 SelectionDAG &DAG) {
10273 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10274 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10275 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10277 if (V2.isUndef()) {
10278 // Check for being able to broadcast a single element.
10279 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10280 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10283 // Straight shuffle of a single input vector. Simulate this by using the
10284 // single input as both of the "inputs" to this instruction.
10285 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10287 if (Subtarget.hasAVX()) {
10288 // If we have AVX, we can use VPERMILPS which will allow folding a load
10289 // into the shuffle.
10290 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10291 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10294 return DAG.getNode(
10295 X86ISD::SHUFP, DL, MVT::v2f64,
10296 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10297 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10298 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10300 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10301 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10303 // If we have a single input, insert that into V1 if we can do so cheaply.
10304 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10305 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10306 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10308 // Try inverting the insertion since for v2 masks it is easy to do and we
10309 // can't reliably sort the mask one way or the other.
10310 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10311 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10312 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10313 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10317 // Try to use one of the special instruction patterns to handle two common
10318 // blend patterns if a zero-blend above didn't work.
10319 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10320 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10321 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10322 // We can either use a special instruction to load over the low double or
10323 // to move just the low double.
10324 return DAG.getNode(
10325 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10326 DL, MVT::v2f64, V2,
10327 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10329 if (Subtarget.hasSSE41())
10330 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10331 Zeroable, Subtarget, DAG))
10334 // Use dedicated unpack instructions for masks that match their pattern.
10336 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10339 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10340 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10341 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10344 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10346 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10347 /// the integer unit to minimize domain crossing penalties. However, for blends
10348 /// it falls back to the floating point shuffle operation with appropriate bitcasting.
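/// For example, a single-input v2i64 shuffle with mask <1, 1> is remapped onto
/// v4i32 and emitted as PSHUFD with mask <2, 3, 2, 3> (an illustrative case of
/// the single-input path below).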
10350 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10351 const APInt &Zeroable,
10352 SDValue V1, SDValue V2,
10353 const X86Subtarget &Subtarget,
10354 SelectionDAG &DAG) {
10355 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10356 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10357 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10359 if (V2.isUndef()) {
10360 // Check for being able to broadcast a single element.
10361 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10362 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10365 // Straight shuffle of a single input vector. For everything from SSE2
10366 // onward this has a single fast instruction with no scary immediates.
10367 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10368 V1 = DAG.getBitcast(MVT::v4i32, V1);
10369 int WidenedMask[4] = {
10370 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10371 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10372 return DAG.getBitcast(
10374 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10375 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10377 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10378 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10379 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10380 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10382 // If we have a blend of two same-type PACKUS operations and the blend aligns
10383 // with the low and high halves, we can just merge the PACKUS operations.
10384 // This is particularly important as it lets us merge shuffles that this
10385 // routine itself creates.
10386 auto GetPackNode = [](SDValue V) {
10387 V = peekThroughBitcasts(V);
10388 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10390 if (SDValue V1Pack = GetPackNode(V1))
10391 if (SDValue V2Pack = GetPackNode(V2)) {
10392 EVT PackVT = V1Pack.getValueType();
10393 if (PackVT == V2Pack.getValueType())
10394 return DAG.getBitcast(MVT::v2i64,
10395 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10396 Mask[0] == 0 ? V1Pack.getOperand(0)
10397 : V1Pack.getOperand(1),
10398 Mask[1] == 2 ? V2Pack.getOperand(0)
10399 : V2Pack.getOperand(1)));
10402 // Try to use shift instructions.
10403 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10404 Zeroable, Subtarget, DAG))
10407 // When loading a scalar and then shuffling it into a vector we can often do
10408 // the insertion cheaply.
10409 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10410 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10412 // Try inverting the insertion since for v2 masks it is easy to do and we
10413 // can't reliably sort the mask one way or the other.
10414 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10415 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10416 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10419 // We have different paths for blend lowering, but they all must use the
10420 // *exact* same predicate.
10421 bool IsBlendSupported = Subtarget.hasSSE41();
10422 if (IsBlendSupported)
10423 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10424 Zeroable, Subtarget, DAG))
10427 // Use dedicated unpack instructions for masks that match their pattern.
10429 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10432 // Try to use byte rotation instructions.
10433 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10434 if (Subtarget.hasSSSE3())
10435 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10436 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10439 // If we have direct support for blends, we should lower by decomposing into
10440 // a permute. That will be faster than the domain cross.
10441 if (IsBlendSupported)
10442 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10445 // We implement this with SHUFPD which is pretty lame because it will likely
10446 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10447 // However, all the alternatives are still more cycles and newer chips don't
10448 // have this problem. It would be really nice if x86 had better shuffles here.
10449 V1 = DAG.getBitcast(MVT::v2f64, V1);
10450 V2 = DAG.getBitcast(MVT::v2f64, V2);
10451 return DAG.getBitcast(MVT::v2i64,
10452 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10455 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10457 /// This is used to disable more specialized lowerings when the shufps lowering
10458 /// will happen to be efficient.
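/// For example, <0, 1, 6, 7> draws its low half purely from V1 and its high
/// half purely from V2, so a single SHUFPS suffices, whereas <0, 5, 2, 7>
/// mixes both inputs within each half and does not (illustrative cases only).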
10459 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10460 // This routine only handles 128-bit shufps.
10461 assert(Mask.size() == 4 && "Unsupported mask size!");
10462 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10463 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10464 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10465 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10467 // To lower with a single SHUFPS we need to have the low half and high half
10468 // each requiring a single input.
10469 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10471 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10477 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10479 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10480 /// It makes no assumptions about whether this is the *best* lowering; it simply uses it.
10482 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10483 ArrayRef<int> Mask, SDValue V1,
10484 SDValue V2, SelectionDAG &DAG) {
10485 SDValue LowV = V1, HighV = V2;
10486 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10488 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10490 if (NumV2Elements == 1) {
10491 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10493 // Compute the index adjacent to V2Index and in the same half by toggling the low bit.
10495 int V2AdjIndex = V2Index ^ 1;
10497 if (Mask[V2AdjIndex] < 0) {
10498 // Handles all the cases where we have a single V2 element and an undef.
10499 // This will only ever happen in the high lanes because we commute the
10500 // vector otherwise.
10502 std::swap(LowV, HighV);
10503 NewMask[V2Index] -= 4;
10505 // Handle the case where the V2 element ends up adjacent to a V1 element.
10506 // To make this work, blend them together as the first step.
10507 int V1Index = V2AdjIndex;
10508 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10509 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10510 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10512 // Now proceed to reconstruct the final blend as we have the necessary
10513 // high or low half formed.
10520 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10521 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10523 } else if (NumV2Elements == 2) {
10524 if (Mask[0] < 4 && Mask[1] < 4) {
10525 // Handle the easy case where we have V1 in the low lanes and V2 in the high lanes.
10529 } else if (Mask[2] < 4 && Mask[3] < 4) {
10530 // We also handle the reversed case because this utility may get called
10531 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10532 // arrange things in the right direction.
10538 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10539 // trying to place elements directly, just blend them and set up the final
10540 // shuffle to place them.
10542 // The first two blend mask elements are for V1, the second two are for V2.
10544 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10545 Mask[2] < 4 ? Mask[2] : Mask[3],
10546 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10547 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10548 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10549 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10551 // Now we do a normal shuffle of V1 by giving V1 as both operands to the SHUFPS.
10554 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10555 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10556 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10557 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10560 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10561 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10564 /// \brief Lower 4-lane 32-bit floating point shuffles.
10566 /// Uses instructions exclusively from the floating point unit to minimize
10567 /// domain crossing penalties, as these are sufficient to implement all v4f32 shuffles.
10569 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10570 const APInt &Zeroable,
10571 SDValue V1, SDValue V2,
10572 const X86Subtarget &Subtarget,
10573 SelectionDAG &DAG) {
10574 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10575 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10576 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10578 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10580 if (NumV2Elements == 0) {
10581 // Check for being able to broadcast a single element.
10582 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10583 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10586 // Use even/odd duplicate instructions for masks that match their pattern.
10587 if (Subtarget.hasSSE3()) {
10588 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10589 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10590 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10591 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10594 if (Subtarget.hasAVX()) {
10595 // If we have AVX, we can use VPERMILPS which will allow folding a load
10596 // into the shuffle.
10597 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10598 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10601 // Otherwise, use a straight shuffle of a single input vector. We pass the
10602 // input vector to both operands to simulate this with a SHUFPS.
10603 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10604 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10607 // There are special ways we can lower some single-element blends. However, we
10608 // have custom ways we can lower more complex single-element blends below that
10609 // we defer to if both this and BLENDPS fail to match, so restrict this to
10610 // when the V2 input is targeting element 0 of the mask -- that is the fast case here.
10612 if (NumV2Elements == 1 && Mask[0] >= 4)
10613 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10614 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10617 if (Subtarget.hasSSE41()) {
10618 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10619 Zeroable, Subtarget, DAG))
10622 // Use INSERTPS if we can complete the shuffle efficiently.
10624 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10627 if (!isSingleSHUFPSMask(Mask))
10628 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10629 DL, MVT::v4f32, V1, V2, Mask, DAG))
10633 // Use low/high mov instructions.
10634 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10635 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10636 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10637 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10639 // Use dedicated unpack instructions for masks that match their pattern.
10641 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10644 // Otherwise fall back to a SHUFPS lowering strategy.
10645 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10648 /// \brief Lower 4-lane i32 vector shuffles.
10650 /// We try to handle these with integer-domain shuffles where we can, but for
10651 /// blends we use the floating point domain blend instructions.
10652 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10653 const APInt &Zeroable,
10654 SDValue V1, SDValue V2,
10655 const X86Subtarget &Subtarget,
10656 SelectionDAG &DAG) {
10657 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10658 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10659 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10661 // Whenever we can lower this as a zext, that instruction is strictly faster
10662 // than any alternative. It also allows us to fold memory operands into the
10663 // shuffle in many cases.
10664 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10665 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10668 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10670 if (NumV2Elements == 0) {
10671 // Check for being able to broadcast a single element.
10672 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10673 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10676 // Straight shuffle of a single input vector. For everything from SSE2
10677 // onward this has a single fast instruction with no scary immediates.
10678 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10679 // but we aren't actually going to use the UNPCK instruction because doing
10680 // so prevents folding a load into this instruction or making a copy.
10681 const int UnpackLoMask[] = {0, 0, 1, 1};
10682 const int UnpackHiMask[] = {2, 2, 3, 3};
10683 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10684 Mask = UnpackLoMask;
10685 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10686 Mask = UnpackHiMask;
10688 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10689 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10692 // Try to use shift instructions.
10693 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10694 Zeroable, Subtarget, DAG))
10697 // There are special ways we can lower some single-element blends.
10698 if (NumV2Elements == 1)
10699 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10700 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10703 // We have different paths for blend lowering, but they all must use the
10704 // *exact* same predicate.
10705 bool IsBlendSupported = Subtarget.hasSSE41();
10706 if (IsBlendSupported)
10707 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10708 Zeroable, Subtarget, DAG))
10711 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10715 // Use dedicated unpack instructions for masks that match their pattern.
10717 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10720 // Try to use byte rotation instructions.
10721 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10722 if (Subtarget.hasSSSE3())
10723 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10724 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10727 // Assume that a single SHUFPS is faster than an alternative sequence of
10728 // multiple instructions (even if the CPU has a domain penalty).
10729 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10730 if (!isSingleSHUFPSMask(Mask)) {
10731 // If we have direct support for blends, we should lower by decomposing into
10732 // a permute. That will be faster than the domain cross.
10733 if (IsBlendSupported)
10734 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10737 // Try to lower by permuting the inputs into an unpack instruction.
10738 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10739 DL, MVT::v4i32, V1, V2, Mask, DAG))
10743 // We implement this with SHUFPS because it can blend from two vectors.
10744 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10745 // up the inputs, bypassing domain shift penalties that we would incur if we
10746 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10748 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10749 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10750 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10751 return DAG.getBitcast(MVT::v4i32, ShufPS);
10754 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10755 /// shuffle lowering, and the most complex part.
10757 /// The lowering strategy is to try to form pairs of input lanes which are
10758 /// targeted at the same half of the final vector, and then use a dword shuffle
10759 /// to place them onto the right half, and finally unpack the paired lanes into
10760 /// their final position.
10762 /// The exact breakdown of how to form these dword pairs and align them on the
10763 /// correct sides is really tricky. See the comments within the function for
10764 /// more of the details.
10766 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10767 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10768 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10769 /// vector, form the analogous 128-bit 8-element Mask.
10770 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10771 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10772 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10773 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10774 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10776 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10777 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10778 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10780 SmallVector<int, 4> LoInputs;
10781 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10782 std::sort(LoInputs.begin(), LoInputs.end());
10783 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10784 SmallVector<int, 4> HiInputs;
10785 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10786 std::sort(HiInputs.begin(), HiInputs.end());
10787 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10789 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10790 int NumHToL = LoInputs.size() - NumLToL;
10792 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10793 int NumHToH = HiInputs.size() - NumLToH;
10794 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10795 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10796 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10797 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10799 // If we are splatting two values from one half - one to each half, then
10800 // we can shuffle that half so each is splatted to a dword, then splat those
10801 // to their respective halves.
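// For example, the mask <1, 1, 1, 1, 3, 3, 3, 3> first uses PSHUFLW to splat
// words 1 and 3 into the two low dwords and then PSHUFD <0, 0, 1, 1> to splat
// those dwords across the halves (illustrative of the SplatHalfs helper below).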
10802 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10804 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10805 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10806 V = DAG.getNode(ShufWOp, DL, VT, V,
10807 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10808 V = DAG.getBitcast(PSHUFDVT, V);
10809 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10810 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10811 return DAG.getBitcast(VT, V);
10814 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10815 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10816 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10817 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10819 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10820 // such inputs we can swap two of the dwords across the half mark and end up
10821 // with <=2 inputs to each half in each half. Once there, we can fall through
10822 // to the generic code below. For example:
10824 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10825 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10827 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10828 // and an existing 2-into-2 on the other half. In this case we may have to
10829 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10830 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10831 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10832 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10833 // half than the one we target for fixing) will be fixed when we re-enter this
10834 // path. We will also combine away any sequence of PSHUFD instructions that
10835 // result into a single instruction. Here is an example of the tricky case:
10837 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10838 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10840 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10842 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10843 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10845 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10846 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10848 // The result is fine to be handled by the generic logic.
10849 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10850 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10851 int AOffset, int BOffset) {
10852 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10853 "Must call this with A having 3 or 1 inputs from the A half.");
10854 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10855 "Must call this with B having 1 or 3 inputs from the B half.");
10856 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10857 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10859 bool ThreeAInputs = AToAInputs.size() == 3;
10861 // Compute the index of dword with only one word among the three inputs in
10862 // a half by taking the sum of the half with three inputs and subtracting
10863 // the sum of the actual three inputs. The difference is the remaining
10864 // word.
10865 int ADWord, BDWord;
10866 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10867 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10868 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10869 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10870 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10871 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10872 int TripleNonInputIdx =
10873 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10874 TripleDWord = TripleNonInputIdx / 2;
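// Worked example (values picked for illustration): if the three inputs from
// the triple half at offset 0 are words {0, 1, 3}, TripleInputSum is
// 0+1+2+3 = 6, the accumulated inputs sum to 4, so TripleNonInputIdx is 2 and
// the dword holding the spare word is TripleDWord = 2 / 2 = 1.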
10876 // We use xor with one to compute the adjacent DWord to whichever one the
10877 // OneInput is in.
10878 OneInputDWord = (OneInput / 2) ^ 1;
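// For instance (illustrative numbers): if OneInput is word 5 it lives in
// dword 5 / 2 = 2, and 2 ^ 1 = 3 is the neighbouring dword in the same
// 64-bit half; that adjacent dword is the one exchanged with TripleDWord by
// the PSHUFD further down.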
10880 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10881 // and BToA inputs. If there is also such a problem with the BToB and AToB
10882 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10883 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10884 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10885 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10886 // Compute how many inputs will be flipped by swapping these DWords. We need
10887 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
10890 int NumFlippedAToBInputs =
10891 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10892 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10893 int NumFlippedBToBInputs =
10894 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10895 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10896 if ((NumFlippedAToBInputs == 1 &&
10897 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10898 (NumFlippedBToBInputs == 1 &&
10899 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10900 // We choose whether to fix the A half or B half based on whether that
10901 // half has zero flipped inputs. At zero, we may not be able to fix it
10902 // with that half. We also bias towards fixing the B half because that
10903 // will more commonly be the high half, and we have to bias one way.
10904 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10905 ArrayRef<int> Inputs) {
10906 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10907 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10908 // Determine whether the free index is in the flipped dword or the
10909 // unflipped dword based on where the pinned index is. We use this bit
10910 // in an xor to conditionally select the adjacent dword.
10911 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
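// Small example (numbers invented for illustration): with DWord = 2 and
// PinnedIdx = 5 we have PinnedIdx / 2 == DWord, so the xor selects the
// adjacent dword and FixFreeIdx = 2 * (2 ^ 1) = 6; if PinnedIdx were in a
// different dword the xor would leave DWord unchanged and pick word 4.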
10912 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10913 if (IsFixIdxInput == IsFixFreeIdxInput)
10914 ++FixFreeIdx;
10915 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10916 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10917 "We need to be changing the number of flipped inputs!");
10918 int PSHUFHalfMask[] = {0, 1, 2, 3};
10919 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10920 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10921 MVT::v8i16, V,
10922 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10924 for (int &M : Mask)
10925 if (M >= 0 && M == FixIdx)
10926 M = FixFreeIdx;
10927 else if (M >= 0 && M == FixFreeIdx)
10928 M = FixIdx;
10929 };
10930 if (NumFlippedBToBInputs != 0) {
10931 int BPinnedIdx =
10932 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10933 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10934 } else {
10935 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10936 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10937 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10938 }
10939 }
10940 }
10942 int PSHUFDMask[] = {0, 1, 2, 3};
10943 PSHUFDMask[ADWord] = BDWord;
10944 PSHUFDMask[BDWord] = ADWord;
10945 V = DAG.getBitcast(
10946 VT,
10947 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10948 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10950 // Adjust the mask to match the new locations of A and B.
10951 for (int &M : Mask)
10952 if (M >= 0 && M/2 == ADWord)
10953 M = 2 * BDWord + M % 2;
10954 else if (M >= 0 && M/2 == BDWord)
10955 M = 2 * ADWord + M % 2;
10957 // Recurse back into this routine to re-compute state now that this isn't
10958 // a 3 and 1 problem.
10959 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10960 DAG);
10961 };
10962 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10963 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10964 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10965 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10967 // At this point there are at most two inputs to the low and high halves from
10968 // each half. That means the inputs can always be grouped into dwords and
10969 // those dwords can then be moved to the correct half with a dword shuffle.
10970 // We use at most one low and one high word shuffle to collect these paired
10971 // inputs into dwords, and finally a dword shuffle to place them.
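// As a concrete illustration (mask chosen here, not part of the original
// text): for Mask = <0,1,4,5,2,3,6,7> each pair of inputs already occupies a
// whole source dword, so both word shuffles end up as no-ops and the whole
// shuffle reduces to a single PSHUFD that swaps dwords 1 and 2.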
10972 int PSHUFLMask[4] = {-1, -1, -1, -1};
10973 int PSHUFHMask[4] = {-1, -1, -1, -1};
10974 int PSHUFDMask[4] = {-1, -1, -1, -1};
10976 // First fix the masks for all the inputs that are staying in their
10977 // original halves. This will then dictate the targets of the cross-half
10978 // shuffles.
10979 auto fixInPlaceInputs =
10980 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10981 MutableArrayRef<int> SourceHalfMask,
10982 MutableArrayRef<int> HalfMask, int HalfOffset) {
10983 if (InPlaceInputs.empty())
10984 return;
10985 if (InPlaceInputs.size() == 1) {
10986 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10987 InPlaceInputs[0] - HalfOffset;
10988 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10989 return;
10990 }
10991 if (IncomingInputs.empty()) {
10992 // Just fix all of the in place inputs.
10993 for (int Input : InPlaceInputs) {
10994 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10995 PSHUFDMask[Input / 2] = Input / 2;
10996 }
10997 return;
10998 }
11000 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11001 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11002 InPlaceInputs[0] - HalfOffset;
11003 // Put the second input next to the first so that they are packed into
11004 // a dword. We find the adjacent index by toggling the low bit.
11005 int AdjIndex = InPlaceInputs[0] ^ 1;
11006 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11007 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11008 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11009 };
11010 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11011 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11013 // Now gather the cross-half inputs and place them into a free dword of
11014 // their target half.
11015 // FIXME: This operation could almost certainly be simplified dramatically to
11016 // look more like the 3-1 fixing operation.
11017 auto moveInputsToRightHalf = [&PSHUFDMask](
11018 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11019 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11020 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11021 int DestOffset) {
11022 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11023 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11024 };
11025 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11026 int Word) {
11027 int LowWord = Word & ~1;
11028 int HighWord = Word | 1;
11029 return isWordClobbered(SourceHalfMask, LowWord) ||
11030 isWordClobbered(SourceHalfMask, HighWord);
11031 };
11033 if (IncomingInputs.empty())
11034 return;
11036 if (ExistingInputs.empty()) {
11037 // Map any dwords with inputs from them into the right half.
11038 for (int Input : IncomingInputs) {
11039 // If the source half mask maps over the inputs, turn those into
11040 // swaps and use the swapped lane.
11041 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11042 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11043 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11044 Input - SourceOffset;
11045 // We have to swap the uses in our half mask in one sweep.
11046 for (int &M : HalfMask)
11047 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11048 M = Input;
11049 else if (M == Input)
11050 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11051 } else {
11052 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11053 Input - SourceOffset &&
11054 "Previous placement doesn't match!");
11055 }
11056 // Note that this correctly re-maps both when we do a swap and when
11057 // we observe the other side of the swap above. We rely on that to
11058 // avoid swapping the members of the input list directly.
11059 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11060 }
11062 // Map the input's dword into the correct half.
11063 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11064 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11065 else
11066 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11067 Input / 2 &&
11068 "Previous placement doesn't match!");
11069 }
11071 // And just directly shift any other-half mask elements to be same-half
11072 // as we will have mirrored the dword containing the element into the
11073 // same position within that half.
11074 for (int &M : HalfMask)
11075 if (M >= SourceOffset && M < SourceOffset + 4) {
11076 M = M - SourceOffset + DestOffset;
11077 assert(M >= 0 && "This should never wrap below zero!");
11078 }
11079 return;
11080 }
11082 // Ensure we have the input in a viable dword of its current half. This
11083 // is particularly tricky because the original position may be clobbered
11084 // by inputs being moved and *staying* in that half.
11085 if (IncomingInputs.size() == 1) {
11086 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11087 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11088 SourceOffset;
11089 SourceHalfMask[InputFixed - SourceOffset] =
11090 IncomingInputs[0] - SourceOffset;
11091 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11092 InputFixed);
11093 IncomingInputs[0] = InputFixed;
11094 }
11095 } else if (IncomingInputs.size() == 2) {
11096 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11097 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11098 // We have two non-adjacent or clobbered inputs we need to extract from
11099 // the source half. To do this, we need to map them into some adjacent
11100 // dword slot in the source mask.
11101 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11102 IncomingInputs[1] - SourceOffset};
11104 // If there is a free slot in the source half mask adjacent to one of
11105 // the inputs, place the other input in it. We use (Index XOR 1) to
11106 // compute an adjacent index.
11107 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11108 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11109 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11110 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11111 InputsFixed[1] = InputsFixed[0] ^ 1;
11112 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11113 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11114 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11115 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11116 InputsFixed[0] = InputsFixed[1] ^ 1;
11117 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11118 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11119 // The two inputs are in the same DWord but it is clobbered and the
11120 // adjacent DWord isn't used at all. Move both inputs to the free
11121 // slot.
11122 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11123 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11124 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11125 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11126 } else {
11127 // The only way we hit this point is if there is no clobbering
11128 // (because there are no off-half inputs to this half) and there is no
11129 // free slot adjacent to one of the inputs. In this case, we have to
11130 // swap an input with a non-input.
11131 for (int i = 0; i < 4; ++i)
11132 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11133 "We can't handle any clobbers here!");
11134 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11135 "Cannot have adjacent inputs here!");
11137 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11138 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11140 // We also have to update the final source mask in this case because
11141 // it may need to undo the above swap.
11142 for (int &M : FinalSourceHalfMask)
11143 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11144 M = InputsFixed[1] + SourceOffset;
11145 else if (M == InputsFixed[1] + SourceOffset)
11146 M = (InputsFixed[0] ^ 1) + SourceOffset;
11148 InputsFixed[1] = InputsFixed[0] ^ 1;
11149 }
11151 // Point everything at the fixed inputs.
11152 for (int &M : HalfMask)
11153 if (M == IncomingInputs[0])
11154 M = InputsFixed[0] + SourceOffset;
11155 else if (M == IncomingInputs[1])
11156 M = InputsFixed[1] + SourceOffset;
11158 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11159 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11160 }
11161 } else {
11162 llvm_unreachable("Unhandled input size!");
11163 }
11165 // Now hoist the DWord down to the right half.
11166 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11167 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11168 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11169 for (int &M : HalfMask)
11170 for (int Input : IncomingInputs)
11171 if (M == Input)
11172 M = FreeDWord * 2 + Input % 2;
11173 };
11174 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11175 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11176 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11177 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11179 // Now enact all the shuffles we've computed to move the inputs into their
11180 // target half.
11181 if (!isNoopShuffleMask(PSHUFLMask))
11182 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11183 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11184 if (!isNoopShuffleMask(PSHUFHMask))
11185 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11186 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11187 if (!isNoopShuffleMask(PSHUFDMask))
11188 V = DAG.getBitcast(
11189 VT,
11190 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11191 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11193 // At this point, each half should contain all its inputs, and we can then
11194 // just shuffle them into their final position.
11195 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11196 "Failed to lift all the high half inputs to the low mask!");
11197 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11198 "Failed to lift all the low half inputs to the high mask!");
11200 // Do a half shuffle for the low mask.
11201 if (!isNoopShuffleMask(LoMask))
11202 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11203 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11205 // Do a half shuffle with the high mask after shifting its values down.
11206 for (int &M : HiMask)
11207 if (M >= 4)
11208 M -= 4;
11209 if (!isNoopShuffleMask(HiMask))
11210 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11211 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11213 return V;
11214 }
11216 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11217 /// blend if only one input is used.
11218 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11219 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11220 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11221 bool &V2InUse) {
11222 SDValue V1Mask[16];
11223 SDValue V2Mask[16];
11224 V1InUse = false;
11225 V2InUse = false;
11227 int Size = Mask.size();
11228 int Scale = 16 / Size;
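// Example of the scaling (values illustrative, not from the original source):
// lowering a v8i16 mask here gives Size = 8 and Scale = 2, so mask element 3
// expands to byte selectors 6 and 7 in the V1 byte mask while the
// corresponding V2 bytes get the 0x80 "zero this lane" selector.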
11229 for (int i = 0; i < 16; ++i) {
11230 if (Mask[i / Scale] < 0) {
11231 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11232 } else {
11233 const int ZeroMask = 0x80;
11234 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11235 : ZeroMask;
11236 int V2Idx = Mask[i / Scale] < Size
11237 ? ZeroMask
11238 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11239 if (Zeroable[i / Scale])
11240 V1Idx = V2Idx = ZeroMask;
11241 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11242 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11243 V1InUse |= (ZeroMask != V1Idx);
11244 V2InUse |= (ZeroMask != V2Idx);
11245 }
11246 }
11248 if (V1InUse)
11249 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11250 DAG.getBitcast(MVT::v16i8, V1),
11251 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11252 if (V2InUse)
11253 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11254 DAG.getBitcast(MVT::v16i8, V2),
11255 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11257 // If we need shuffled inputs from both, blend the two.
11258 SDValue V;
11259 if (V1InUse && V2InUse)
11260 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11261 else
11262 V = V1InUse ? V1 : V2;
11264 // Cast the result back to the correct type.
11265 return DAG.getBitcast(VT, V);
11266 }
11268 /// \brief Generic lowering of 8-lane i16 shuffles.
11270 /// This handles both single-input shuffles and combined shuffle/blends with
11271 /// two inputs. The single input shuffles are immediately delegated to
11272 /// a dedicated lowering routine.
11274 /// The blends are lowered in one of three fundamental ways. If there are few
11275 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11276 /// of the input is significantly cheaper when lowered as an interleaving of
11277 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11278 /// halves of the inputs separately (making them have relatively few inputs)
11279 /// and then concatenate them.
11280 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11281 const APInt &Zeroable,
11282 SDValue V1, SDValue V2,
11283 const X86Subtarget &Subtarget,
11284 SelectionDAG &DAG) {
11285 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11286 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11287 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11289 // Whenever we can lower this as a zext, that instruction is strictly faster
11290 // than any alternative.
11291 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11292 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11293 return ZExt;
11295 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11297 if (NumV2Inputs == 0) {
11298 // Check for being able to broadcast a single element.
11299 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11300 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11301 return Broadcast;
11303 // Try to use shift instructions.
11304 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11305 Zeroable, Subtarget, DAG))
11306 return Shift;
11308 // Use dedicated unpack instructions for masks that match their pattern.
11309 if (SDValue V =
11310 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11311 return V;
11313 // Try to use byte rotation instructions.
11314 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11315 Mask, Subtarget, DAG))
11316 return Rotate;
11318 // Make a copy of the mask so it can be modified.
11319 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11320 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11321 MutableMask, Subtarget,
11322 DAG);
11323 }
11325 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11326 "All single-input shuffles should be canonicalized to be V1-input "
11327 "shuffles.");
11329 // Try to use shift instructions.
11330 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11331 Zeroable, Subtarget, DAG))
11332 return Shift;
11334 // See if we can use SSE4A Extraction / Insertion.
11335 if (Subtarget.hasSSE4A())
11336 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11337 Zeroable, DAG))
11338 return V;
11340 // There are special ways we can lower some single-element blends.
11341 if (NumV2Inputs == 1)
11342 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11343 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11344 return V;
11346 // We have different paths for blend lowering, but they all must use the
11347 // *exact* same predicate.
11348 bool IsBlendSupported = Subtarget.hasSSE41();
11349 if (IsBlendSupported)
11350 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11351 Zeroable, Subtarget, DAG))
11352 return Blend;
11354 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11355 Zeroable, DAG))
11356 return Masked;
11358 // Use dedicated unpack instructions for masks that match their pattern.
11359 if (SDValue V =
11360 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11361 return V;
11363 // Try to use byte rotation instructions.
11364 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11365 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11366 return Rotate;
11368 if (SDValue BitBlend =
11369 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11370 return BitBlend;
11372 // Try to lower by permuting the inputs into an unpack instruction.
11373 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11374 V2, Mask, DAG))
11375 return Unpack;
11377 // If we can't directly blend but can use PSHUFB, that will be better as it
11378 // can both shuffle and set up the inefficient blend.
11379 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11380 bool V1InUse, V2InUse;
11381 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11382 Zeroable, DAG, V1InUse, V2InUse);
11383 }
11385 // We can always bit-blend if we have to so the fallback strategy is to
11386 // decompose into single-input permutes and blends.
11387 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11388 Mask, DAG);
11389 }
11391 /// \brief Check whether a compaction lowering can be done by dropping even
11392 /// elements and compute how many times even elements must be dropped.
11394 /// This handles shuffles which take every Nth element where N is a power of
11395 /// two. Example shuffle masks:
11397 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11398 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11399 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11400 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11401 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11402 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11404 /// Any of these lanes can of course be undef.
11406 /// This routine only supports N <= 3.
11407 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11408 /// for larger N.
11410 /// \returns N above, or the number of times even elements must be dropped if
11411 /// there is such a number. Otherwise returns zero.
11412 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11413 bool IsSingleInput) {
11414 // The modulus for the shuffle vector entries is based on whether this is
11415 // a single input or not.
11416 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11417 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11418 "We should only be called with masks with a power-of-2 size!");
11420 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11422 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11423 // and 2^3 simultaneously. This is because we may have ambiguity with
11424 // partially undef inputs.
11425 bool ViableForN[3] = {true, true, true};
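// Concretely (illustrative numbers): for a 16-element single-input mask the
// modulus is 16, so element i = 1 must read lane 2 for N = 1, lane 4 for
// N = 2 and lane 8 for N = 3; an undef entry keeps all three strides alive,
// which is why the flags are tracked simultaneously.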
11427 for (int i = 0, e = Mask.size(); i < e; ++i) {
11428 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11429 // want.
11430 if (Mask[i] < 0)
11431 continue;
11433 bool IsAnyViable = false;
11434 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11435 if (ViableForN[j]) {
11436 uint64_t N = j + 1;
11438 // The shuffle mask must be equal to (i * 2^N) % M.
11439 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11440 IsAnyViable = true;
11441 else
11442 ViableForN[j] = false;
11443 }
11444 // Early exit if we exhaust the possible powers of two.
11445 if (!IsAnyViable)
11446 break;
11447 }
11449 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11450 if (ViableForN[j])
11451 return j + 1;
11453 // Return 0 as there is no viable power of two.
11454 return 0;
11455 }
11457 /// \brief Generic lowering of v16i8 shuffles.
11459 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11460 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11461 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11462 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11463 /// back together.
11464 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11465 const APInt &Zeroable,
11466 SDValue V1, SDValue V2,
11467 const X86Subtarget &Subtarget,
11468 SelectionDAG &DAG) {
11469 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11470 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11471 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11473 // Try to use shift instructions.
11474 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11475 Zeroable, Subtarget, DAG))
11476 return Shift;
11478 // Try to use byte rotation instructions.
11479 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11480 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11481 return Rotate;
11483 // Try to use a zext lowering.
11484 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11485 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11486 return ZExt;
11488 // See if we can use SSE4A Extraction / Insertion.
11489 if (Subtarget.hasSSE4A())
11490 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11491 Zeroable, DAG))
11492 return V;
11494 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11496 // For single-input shuffles, there are some nicer lowering tricks we can use.
11497 if (NumV2Elements == 0) {
11498 // Check for being able to broadcast a single element.
11499 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11500 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11501 return Broadcast;
11503 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11504 // Notably, this handles splat and partial-splat shuffles more efficiently.
11505 // However, it only makes sense if the pre-duplication shuffle simplifies
11506 // things significantly. Currently, this means we need to be able to
11507 // express the pre-duplication shuffle as an i16 shuffle.
11509 // FIXME: We should check for other patterns which can be widened into an
11510 // i16 shuffle as well.
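// For instance (example mask, not from the original comment): the byte mask
// <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7> duplicates each source byte, so it can be
// expressed as the identity i16 pre-shuffle followed by an UNPCKL of the
// vector with itself.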
11511 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11512 for (int i = 0; i < 16; i += 2)
11513 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11514 return false;
11516 return true;
11517 };
11518 auto tryToWidenViaDuplication = [&]() -> SDValue {
11519 if (!canWidenViaDuplication(Mask))
11520 return SDValue();
11521 SmallVector<int, 4> LoInputs;
11522 copy_if(Mask, std::back_inserter(LoInputs),
11523 [](int M) { return M >= 0 && M < 8; });
11524 std::sort(LoInputs.begin(), LoInputs.end());
11525 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11526 LoInputs.end());
11527 SmallVector<int, 4> HiInputs;
11528 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11529 std::sort(HiInputs.begin(), HiInputs.end());
11530 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11531 HiInputs.end());
11533 bool TargetLo = LoInputs.size() >= HiInputs.size();
11534 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11535 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11537 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11538 SmallDenseMap<int, int, 8> LaneMap;
11539 for (int I : InPlaceInputs) {
11540 PreDupI16Shuffle[I/2] = I/2;
11541 LaneMap[I] = I;
11542 }
11543 int j = TargetLo ? 0 : 4, je = j + 4;
11544 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11545 // Check if j is already a shuffle of this input. This happens when
11546 // there are two adjacent bytes after we move the low one.
11547 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11548 // If we haven't yet mapped the input, search for a slot into which
11549 // we can map it.
11550 while (j < je && PreDupI16Shuffle[j] >= 0)
11551 ++j;
11553 if (j == je)
11554 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11555 return SDValue();
11557 // Map this input with the i16 shuffle.
11558 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11559 }
11561 // Update the lane map based on the mapping we ended up with.
11562 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11563 }
11564 V1 = DAG.getBitcast(
11565 MVT::v16i8,
11566 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11567 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11569 // Unpack the bytes to form the i16s that will be shuffled into place.
11570 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11571 MVT::v16i8, V1, V1);
11573 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11574 for (int i = 0; i < 16; ++i)
11575 if (Mask[i] >= 0) {
11576 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11577 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11578 if (PostDupI16Shuffle[i / 2] < 0)
11579 PostDupI16Shuffle[i / 2] = MappedMask;
11580 else
11581 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11582 "Conflicting entries in the original shuffle!");
11583 }
11584 return DAG.getBitcast(
11585 MVT::v16i8,
11586 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11587 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11588 };
11589 if (SDValue V = tryToWidenViaDuplication())
11590 return V;
11591 }
11593 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11594 Zeroable, DAG))
11595 return Masked;
11597 // Use dedicated unpack instructions for masks that match their pattern.
11598 if (SDValue V =
11599 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11600 return V;
11602 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11603 // with PSHUFB. It is important to do this before we attempt to generate any
11604 // blends but after all of the single-input lowerings. If the single input
11605 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11606 // want to preserve that and we can DAG combine any longer sequences into
11607 // a PSHUFB in the end. But once we start blending from multiple inputs,
11608 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11609 // and there are *very* few patterns that would actually be faster than the
11610 // PSHUFB approach because of its ability to zero lanes.
11612 // FIXME: The only exceptions to the above are blends which are exact
11613 // interleavings with direct instructions supporting them. We currently don't
11614 // handle those well here.
11615 if (Subtarget.hasSSSE3()) {
11616 bool V1InUse = false;
11617 bool V2InUse = false;
11619 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11620 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11622 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11623 // do so. This avoids using them to handle blends-with-zero which is
11624 // important as a single pshufb is significantly faster for that.
11625 if (V1InUse && V2InUse) {
11626 if (Subtarget.hasSSE41())
11627 if (SDValue Blend = lowerVectorShuffleAsBlend(
11628 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11629 return Blend;
11631 // We can use an unpack to do the blending rather than an or in some
11632 // cases. Even though the or may be (very minorly) more efficient, we
11633 // prefer this lowering because there are common cases where part of
11634 // the complexity of the shuffles goes away when we do the final blend as
11635 // an unpack.
11636 // FIXME: It might be worth trying to detect if the unpack-feeding
11637 // shuffles will both be pshufb, in which case we shouldn't bother with
11638 // this.
11639 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11640 DL, MVT::v16i8, V1, V2, Mask, DAG))
11641 return Unpack;
11642 }
11644 return PSHUFB;
11645 }
11647 // There are special ways we can lower some single-element blends.
11648 if (NumV2Elements == 1)
11649 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11650 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11651 return V;
11653 if (SDValue BitBlend =
11654 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11655 return BitBlend;
11657 // Check whether a compaction lowering can be done. This handles shuffles
11658 // which take every Nth element for some even N. See the helper function for
11659 // details.
11661 // We special case these as they can be particularly efficiently handled with
11662 // the PACKUSWB instruction on x86 and they show up in common patterns of
11663 // rearranging bytes to truncate wide elements.
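// For example (illustrative): truncating each i16 lane to i8 uses the mask
// <0,2,4,...,30>, i.e. NumEvenDrops == 1 below: the AND with 0x00FF clears
// the high byte of every word and a single PACKUS then compacts the two
// inputs.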
11664 bool IsSingleInput = V2.isUndef();
11665 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11666 // NumEvenDrops is the power of two stride of the elements. Another way of
11667 // thinking about it is that we need to drop the even elements this many
11668 // times to get the original input.
11670 // First we need to zero all the dropped bytes.
11671 assert(NumEvenDrops <= 3 &&
11672 "No support for dropping even elements more than 3 times.");
11673 // We use the mask type to pick which bytes are preserved based on how many
11674 // elements are dropped.
11675 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11676 SDValue ByteClearMask = DAG.getBitcast(
11677 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11678 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11679 if (!IsSingleInput)
11680 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11682 // Now pack things back together.
11683 V1 = DAG.getBitcast(MVT::v8i16, V1);
11684 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11685 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11686 for (int i = 1; i < NumEvenDrops; ++i) {
11687 Result = DAG.getBitcast(MVT::v8i16, Result);
11688 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11689 }
11691 return Result;
11692 }
11694 // Handle multi-input cases by blending single-input shuffles.
11695 if (NumV2Elements > 0)
11696 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11697 Mask, DAG);
11699 // The fallback path for single-input shuffles widens this into two v8i16
11700 // vectors with unpacks, shuffles those, and then pulls them back together
11701 // with a pack.
11703 SDValue V = V1;
11704 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11705 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11706 for (int i = 0; i < 16; ++i)
11707 if (Mask[i] >= 0)
11708 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11710 SDValue VLoHalf, VHiHalf;
11711 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11712 // them out and avoid using UNPCK{L,H} to extract the elements of V as
11713 // i16s.
11714 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11715 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11716 // Use a mask to drop the high bytes.
11717 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11718 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11719 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11721 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11722 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11724 // Squash the masks to point directly into VLoHalf.
11725 for (int &M : LoBlendMask)
11726 if (M >= 0)
11727 M /= 2;
11728 for (int &M : HiBlendMask)
11729 if (M >= 0)
11730 M /= 2;
11731 } else {
11732 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11733 // VHiHalf so that we can blend them as i16s.
11734 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11736 VLoHalf = DAG.getBitcast(
11737 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11738 VHiHalf = DAG.getBitcast(
11739 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11740 }
11742 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11743 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11745 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11746 }
11748 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11750 /// This routine breaks down the specific type of 128-bit shuffle and
11751 /// dispatches to the lowering routines accordingly.
11752 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11753 MVT VT, SDValue V1, SDValue V2,
11754 const APInt &Zeroable,
11755 const X86Subtarget &Subtarget,
11756 SelectionDAG &DAG) {
11757 switch (VT.SimpleTy) {
11758 case MVT::v2i64:
11759 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11760 case MVT::v2f64:
11761 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11762 case MVT::v4i32:
11763 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11764 case MVT::v4f32:
11765 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11766 case MVT::v8i16:
11767 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11768 case MVT::v16i8:
11769 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11771 default:
11772 llvm_unreachable("Unimplemented!");
11773 }
11774 }
11776 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11778 /// This routine just extracts two subvectors, shuffles them independently, and
11779 /// then concatenates them back together. This should work effectively with all
11780 /// AVX vector shuffle types.
11781 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11782 SDValue V2, ArrayRef<int> Mask,
11783 SelectionDAG &DAG) {
11784 assert(VT.getSizeInBits() >= 256 &&
11785 "Only for 256-bit or wider vector shuffles!");
11786 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11787 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11789 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11790 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11792 int NumElements = VT.getVectorNumElements();
11793 int SplitNumElements = NumElements / 2;
11794 MVT ScalarVT = VT.getVectorElementType();
11795 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11797 // Rather than splitting build-vectors, just build two narrower build
11798 // vectors. This helps shuffling with splats and zeros.
11799 auto SplitVector = [&](SDValue V) {
11800 V = peekThroughBitcasts(V);
11802 MVT OrigVT = V.getSimpleValueType();
11803 int OrigNumElements = OrigVT.getVectorNumElements();
11804 int OrigSplitNumElements = OrigNumElements / 2;
11805 MVT OrigScalarVT = OrigVT.getVectorElementType();
11806 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11808 SDValue LoV, HiV;
11810 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11811 if (!BV) {
11812 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11813 DAG.getIntPtrConstant(0, DL));
11814 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11815 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11816 } else {
11818 SmallVector<SDValue, 16> LoOps, HiOps;
11819 for (int i = 0; i < OrigSplitNumElements; ++i) {
11820 LoOps.push_back(BV->getOperand(i));
11821 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11822 }
11823 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11824 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11825 }
11826 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11827 DAG.getBitcast(SplitVT, HiV));
11828 };
11830 SDValue LoV1, HiV1, LoV2, HiV2;
11831 std::tie(LoV1, HiV1) = SplitVector(V1);
11832 std::tie(LoV2, HiV2) = SplitVector(V2);
11834 // Now create two 4-way blends of these half-width vectors.
11835 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11836 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11837 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11838 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11839 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11840 for (int i = 0; i < SplitNumElements; ++i) {
11841 int M = HalfMask[i];
11842 if (M >= NumElements) {
11843 if (M >= NumElements + SplitNumElements)
11844 UseHiV2 = true;
11845 else
11846 UseLoV2 = true;
11847 V2BlendMask[i] = M - NumElements;
11848 BlendMask[i] = SplitNumElements + i;
11849 } else if (M >= 0) {
11850 if (M >= SplitNumElements)
11851 UseHiV1 = true;
11852 else
11853 UseLoV1 = true;
11854 V1BlendMask[i] = M;
11855 BlendMask[i] = i;
11856 }
11857 }
11859 // Because the lowering happens after all combining takes place, we need to
11860 // manually combine these blend masks as much as possible so that we create
11861 // a minimal number of high-level vector shuffle nodes.
11863 // First try just blending the halves of V1 or V2.
11864 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11865 return DAG.getUNDEF(SplitVT);
11866 if (!UseLoV2 && !UseHiV2)
11867 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11868 if (!UseLoV1 && !UseHiV1)
11869 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11871 SDValue V1Blend, V2Blend;
11872 if (UseLoV1 && UseHiV1) {
11873 V1Blend =
11874 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11875 } else {
11876 // We only use half of V1 so map the usage down into the final blend mask.
11877 V1Blend = UseLoV1 ? LoV1 : HiV1;
11878 for (int i = 0; i < SplitNumElements; ++i)
11879 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11880 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11881 }
11882 if (UseLoV2 && UseHiV2) {
11883 V2Blend =
11884 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11885 } else {
11886 // We only use half of V2 so map the usage down into the final blend mask.
11887 V2Blend = UseLoV2 ? LoV2 : HiV2;
11888 for (int i = 0; i < SplitNumElements; ++i)
11889 if (BlendMask[i] >= SplitNumElements)
11890 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11891 }
11892 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11893 };
11894 SDValue Lo = HalfBlend(LoMask);
11895 SDValue Hi = HalfBlend(HiMask);
11896 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11897 }
11899 /// \brief Either split a vector in halves or decompose the shuffles and the
11900 /// blend.
11902 /// This is provided as a good fallback for many lowerings of non-single-input
11903 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11904 /// between splitting the shuffle into 128-bit components and stitching those
11905 /// back together vs. extracting the single-input shuffles and blending those
11906 /// results.
11907 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11908 SDValue V1, SDValue V2,
11909 ArrayRef<int> Mask,
11910 SelectionDAG &DAG) {
11911 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11912 "shuffles as it could then recurse on itself.");
11913 int Size = Mask.size();
11915 // If this can be modeled as a broadcast of two elements followed by a blend,
11916 // prefer that lowering. This is especially important because broadcasts can
11917 // often fold with memory operands.
11918 auto DoBothBroadcast = [&] {
11919 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11920 for (int M : Mask)
11921 if (M >= Size) {
11922 if (V2BroadcastIdx < 0)
11923 V2BroadcastIdx = M - Size;
11924 else if (M - Size != V2BroadcastIdx)
11925 return false;
11926 } else if (M >= 0) {
11927 if (V1BroadcastIdx < 0)
11928 V1BroadcastIdx = M;
11929 else if (M != V1BroadcastIdx)
11930 return false;
11931 }
11932 return true;
11933 };
11934 if (DoBothBroadcast())
11935 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11936 DAG);
11938 // If the inputs all stem from a single 128-bit lane of each input, then we
11939 // split them rather than blending because the split will decompose to
11940 // unusually few instructions.
11941 int LaneCount = VT.getSizeInBits() / 128;
11942 int LaneSize = Size / LaneCount;
11943 SmallBitVector LaneInputs[2];
11944 LaneInputs[0].resize(LaneCount, false);
11945 LaneInputs[1].resize(LaneCount, false);
11946 for (int i = 0; i < Size; ++i)
11947 if (Mask[i] >= 0)
11948 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11949 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11950 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11952 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11953 // that the decomposed single-input shuffles don't end up here.
11954 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11955 }
11957 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11958 /// a permutation and blend of those lanes.
11960 /// This essentially blends the out-of-lane inputs to each lane into the lane
11961 /// from a permuted copy of the vector. This lowering strategy results in four
11962 /// instructions in the worst case for a single-input cross lane shuffle which
11963 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11964 /// of. Special cases for each particular shuffle pattern should be handled
11965 /// prior to trying this lowering.
11966 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11967 SDValue V1, SDValue V2,
11968 ArrayRef<int> Mask,
11969 SelectionDAG &DAG) {
11970 // FIXME: This should probably be generalized for 512-bit vectors as well.
11971 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11972 int Size = Mask.size();
11973 int LaneSize = Size / 2;
11975 // If there are only inputs from one 128-bit lane, splitting will in fact be
11976 // less expensive. The flags track whether the given lane contains an element
11977 // that crosses to another lane.
11978 bool LaneCrossing[2] = {false, false};
11979 for (int i = 0; i < Size; ++i)
11980 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11981 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11982 if (!LaneCrossing[0] || !LaneCrossing[1])
11983 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11985 assert(V2.isUndef() &&
11986 "This last part of this routine only works on single input shuffles");
11988 SmallVector<int, 32> FlippedBlendMask(Size);
11989 for (int i = 0; i < Size; ++i)
11990 FlippedBlendMask[i] =
11991 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11992 ? Mask[i]
11993 : Mask[i] % LaneSize +
11994 (i / LaneSize) * LaneSize + Size);
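// Worked example (mask chosen for illustration): for a single-input v4f64
// shuffle <2,1,0,3>, elements 0 and 2 cross lanes, so Flipped below is V1
// with its 128-bit lanes swapped and FlippedBlendMask becomes <4,1,6,3>, an
// in-lane blend of V1 and Flipped.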
11996 // Flip the vector, and blend the results which should now be in-lane. The
11997 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11998 // 5 for the high source. The value 3 selects the high half of source 2 and
11999 // the value 2 selects the low half of source 2. We only use source 2 to
12000 // allow folding it into a memory operand.
12001 unsigned PERMMask = 3 | 2 << 4;
12002 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
12003 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
12004 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12005 }
12007 /// \brief Handle lowering 2-lane 128-bit shuffles.
12008 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12009 SDValue V2, ArrayRef<int> Mask,
12010 const APInt &Zeroable,
12011 const X86Subtarget &Subtarget,
12012 SelectionDAG &DAG) {
12013 SmallVector<int, 4> WidenedMask;
12014 if (!canWidenShuffleElements(Mask, WidenedMask))
12015 return SDValue();
12017 // TODO: If minimizing size and one of the inputs is a zero vector and the
12018 // zero vector has only one use, we could use a VPERM2X128 to save the
12019 // instruction bytes needed to explicitly generate the zero vector.
12021 // Blends are faster and handle all the non-lane-crossing cases.
12022 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12023 Zeroable, Subtarget, DAG))
12024 return Blend;
12026 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
12027 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
12029 // If either input operand is a zero vector, use VPERM2X128 because its mask
12030 // allows us to replace the zero input with an implicit zero.
12031 if (!IsV1Zero && !IsV2Zero) {
12032 // Check for patterns which can be matched with a single insert of a 128-bit
12033 // subvector.
12034 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12035 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12036 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
12037 if (Subtarget.hasAVX2() && V2.isUndef())
12038 return SDValue();
12040 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12041 VT.getVectorNumElements() / 2);
12042 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12043 DAG.getIntPtrConstant(0, DL));
12044 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12045 OnlyUsesV1 ? V1 : V2,
12046 DAG.getIntPtrConstant(0, DL));
12047 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12048 }
12049 }
12051 // Otherwise form a 128-bit permutation. After accounting for undefs,
12052 // convert the 64-bit shuffle mask selection values into 128-bit
12053 // selection bits by dividing the indexes by 2 and shifting into positions
12054 // defined by a vperm2*128 instruction's immediate control byte.
12056 // The immediate permute control byte looks like this:
12057 // [1:0] - select 128 bits from sources for low half of destination
12058 // [2]   - ignore
12059 // [3]   - zero low half of destination
12060 // [5:4] - select 128 bits from sources for high half of destination
12061 // [6]   - ignore
12062 // [7]   - zero high half of destination
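// Example (illustrative): a widened mask of <0, 3> selects the low half of V1
// for the low destination lane and the high half of V2 for the high lane,
// giving PermMask = 0 | (3 << 4) = 0x30.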
12064 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
12065 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
12067 unsigned PermMask = MaskLO | (MaskHI << 4);
12069 // If either input is a zero vector, replace it with an undef input.
12070 // Shuffle mask values < 4 are selecting elements of V1.
12071 // Shuffle mask values >= 4 are selecting elements of V2.
12072 // Adjust each half of the permute mask by clearing the half that was
12073 // selecting the zero vector and setting the zero mask bit.
12074 if (IsV1Zero) {
12075 V1 = DAG.getUNDEF(VT);
12076 if (MaskLO < 2)
12077 PermMask = (PermMask & 0xf0) | 0x08;
12078 if (MaskHI < 2)
12079 PermMask = (PermMask & 0x0f) | 0x80;
12080 }
12081 if (IsV2Zero) {
12082 V2 = DAG.getUNDEF(VT);
12083 if (MaskLO >= 2)
12084 PermMask = (PermMask & 0xf0) | 0x08;
12085 if (MaskHI >= 2)
12086 PermMask = (PermMask & 0x0f) | 0x80;
12087 }
12089 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12090 DAG.getConstant(PermMask, DL, MVT::i8));
12091 }
12093 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12094 /// shuffling each lane.
12096 /// This will only succeed when the result of fixing the 128-bit lanes results
12097 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12098 /// each 128-bit lane. This handles many cases where we can quickly blend away
12099 /// the lane crosses early and then use simpler shuffles within each lane.
12101 /// FIXME: It might be worthwhile at some point to support this without
12102 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12103 /// in x86 only floating point has interesting non-repeating shuffles, and even
12104 /// those are still *marginally* more expensive.
12105 static SDValue lowerVectorShuffleByMerging128BitLanes(
12106 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12107 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12108 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12110 int Size = Mask.size();
12111 int LaneSize = 128 / VT.getScalarSizeInBits();
12112 int NumLanes = Size / LaneSize;
12113 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12115 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12116 // check whether the in-128-bit lane shuffles share a repeating pattern.
12117 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12118 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12119 for (int i = 0; i < Size; ++i) {
12120 if (Mask[i] < 0)
12121 continue;
12123 int j = i / LaneSize;
12125 if (Lanes[j] < 0) {
12126 // First entry we've seen for this lane.
12127 Lanes[j] = Mask[i] / LaneSize;
12128 } else if (Lanes[j] != Mask[i] / LaneSize) {
12129 // This doesn't match the lane selected previously!
12130 return SDValue();
12131 }
12133 // Check that within each lane we have a consistent shuffle mask.
12134 int k = i % LaneSize;
12135 if (InLaneMask[k] < 0) {
12136 InLaneMask[k] = Mask[i] % LaneSize;
12137 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12138 // This doesn't fit a repeating in-lane mask.
12139 return SDValue();
12140 }
12141 }
12143 // First shuffle the lanes into place.
12144 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12145 VT.getSizeInBits() / 64);
12146 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12147 for (int i = 0; i < NumLanes; ++i)
12148 if (Lanes[i] >= 0) {
12149 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12150 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12151 }
12153 V1 = DAG.getBitcast(LaneVT, V1);
12154 V2 = DAG.getBitcast(LaneVT, V2);
12155 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12157 // Cast it back to the type we actually want.
12158 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12160 // Now do a simple shuffle that isn't lane crossing.
12161 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12162 for (int i = 0; i < Size; ++i)
12163 if (Mask[i] >= 0)
12164 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12165 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12166 "Must not introduce lane crosses at this point!");
12168 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12169 }
12171 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12172 /// This allows for fast cases such as subvector extraction/insertion
12173 /// or shuffling smaller vector types which can lower more efficiently.
12174 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12175 SDValue V1, SDValue V2,
12176 ArrayRef<int> Mask,
12177 const X86Subtarget &Subtarget,
12178 SelectionDAG &DAG) {
12179 assert(VT.is256BitVector() && "Expected 256-bit vector");
12181 unsigned NumElts = VT.getVectorNumElements();
12182 unsigned HalfNumElts = NumElts / 2;
12183 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12185 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12186 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12187 if (!UndefLower && !UndefUpper)
12188 return SDValue();
12190 // Upper half is undef and lower half is whole upper subvector.
12191 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12192 if (UndefUpper &&
12193 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12194 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12195 DAG.getIntPtrConstant(HalfNumElts, DL));
12196 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12197 DAG.getIntPtrConstant(0, DL));
12198 }
12200 // Lower half is undef and upper half is whole lower subvector.
12201 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12202 if (UndefLower &&
12203 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12204 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12205 DAG.getIntPtrConstant(0, DL));
12206 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12207 DAG.getIntPtrConstant(HalfNumElts, DL));
12208 }
12210 // If the shuffle only uses two of the four halves of the input operands,
12211 // then extract them and perform the 'half' shuffle at half width.
12212 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12213 int HalfIdx1 = -1, HalfIdx2 = -1;
12214 SmallVector<int, 8> HalfMask(HalfNumElts);
12215 unsigned Offset = UndefLower ? HalfNumElts : 0;
12216 for (unsigned i = 0; i != HalfNumElts; ++i) {
12217 int M = Mask[i + Offset];
12218 if (M < 0) {
12219 HalfMask[i] = M;
12220 continue;
12221 }
12223 // Determine which of the 4 half vectors this element is from.
12224 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12225 int HalfIdx = M / HalfNumElts;
12227 // Determine the element index into its half vector source.
12228 int HalfElt = M % HalfNumElts;
12230 // We can shuffle with up to 2 half vectors, set the new 'half'
12231 // shuffle mask accordingly.
12232 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12233 HalfMask[i] = HalfElt;
12234 HalfIdx1 = HalfIdx;
12235 continue;
12236 }
12237 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12238 HalfMask[i] = HalfElt + HalfNumElts;
12239 HalfIdx2 = HalfIdx;
12240 continue;
12241 }
12243 // Too many half vectors referenced.
12244 return SDValue();
12245 }
12246 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12248 // Only shuffle the halves of the inputs when useful.
12249 int NumLowerHalves =
12250 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12251 int NumUpperHalves =
12252 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12254 // uuuuXXXX - don't extract uppers just to insert again.
12255 if (UndefLower && NumUpperHalves != 0)
12256 return SDValue();
12258 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12259 if (UndefUpper && NumUpperHalves == 2)
12260 return SDValue();
12262 // AVX2 - XXXXuuuu - always extract lowers.
12263 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12264 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12265 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12266 return SDValue();
12267 // AVX2 supports variable 32-bit element cross-lane shuffles.
12268 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12269 // XXXXuuuu - don't extract lowers and uppers.
12270 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12271 return SDValue();
12272 }
12273 }
12275 auto GetHalfVector = [&](int HalfIdx) {
12276 if (HalfIdx < 0)
12277 return DAG.getUNDEF(HalfVT);
12278 SDValue V = (HalfIdx < 2 ? V1 : V2);
12279 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12280 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12281 DAG.getIntPtrConstant(HalfIdx, DL));
12282 };
12284 SDValue Half1 = GetHalfVector(HalfIdx1);
12285 SDValue Half2 = GetHalfVector(HalfIdx2);
12286 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12287 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12288 DAG.getIntPtrConstant(Offset, DL));
12289 }
12291 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12292 /// given mask.
12294 /// This returns true if the elements from a particular input are already in the
12295 /// slot required by the given mask and require no permutation.
12296 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12297 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12298 int Size = Mask.size();
12299 for (int i = 0; i < Size; ++i)
12300 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12301 return false;
12303 return true;
12304 }
12306 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12307 /// every lane can be represented as the same repeating mask - allowing us to
12308 /// shuffle the sources with the repeating shuffle and then permute the result
12309 /// to the destination lanes.
12310 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12311 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12313 int NumElts = VT.getVectorNumElements();
12314 int NumLanes = VT.getSizeInBits() / 128;
12315 int NumLaneElts = NumElts / NumLanes;
12317 // On AVX2 we may be able to just shuffle the lowest elements and then
12318 // broadcast the result.
12319 if (Subtarget.hasAVX2()) {
12320 for (unsigned BroadcastSize : {16, 32, 64}) {
12321 if (BroadcastSize <= VT.getScalarSizeInBits())
12322 continue;
12323 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12325 // Attempt to match a repeating pattern every NumBroadcastElts,
12326 // accounting for UNDEFs but only references the lowest 128-bit
12327 // lane of the inputs.
12328 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12329 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12330 for (int j = 0; j != NumBroadcastElts; ++j) {
12331 int M = Mask[i + j];
12332 if (M < 0)
12333 continue;
12334 int &R = RepeatMask[j];
12335 if (0 != ((M % NumElts) / NumLaneElts))
12336 return false;
12337 if (0 <= R && R != M)
12338 return false;
12339 R = M;
12340 }
12341 return true;
12342 };
12344 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12345 if (!FindRepeatingBroadcastMask(RepeatMask))
12346 continue;
12348 // Shuffle the (lowest) repeated elements in place for broadcast.
12349 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12351 // Shuffle the actual broadcast.
12352 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12353 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12354 for (int j = 0; j != NumBroadcastElts; ++j)
12355 BroadcastMask[i + j] = j;
12356 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12357 BroadcastMask);
12358 }
12359 }
12361 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12362 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12363 return SDValue();
12365 // Bail if we already have a repeated lane shuffle mask.
12366 SmallVector<int, 8> RepeatedShuffleMask;
12367 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12368 return SDValue();
12370 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12371 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12372 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12373 int NumSubLanes = NumLanes * SubLaneScale;
12374 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12376 // Check that all the sources are coming from the same lane and see if we can
12377 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12378 // determine the source sub-lane for each destination sub-lane.
12379 int TopSrcSubLane = -1;
12380 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12381 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12382 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12383 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
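// Note: one candidate repeated mask is kept per sub-lane position; with AVX2
// each 128-bit lane splits into two 64-bit sub-lanes, so at most two
// candidates are tracked here.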
12385 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12386 // Extract the sub-lane mask, check that it all comes from the same lane
12387 // and normalize the mask entries to come from the first lane.
12388 int SrcLane = -1;
12389 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12390 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12391 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12392 if (M < 0)
12393 continue;
12394 int Lane = (M % NumElts) / NumLaneElts;
12395 if ((0 <= SrcLane) && (SrcLane != Lane))
12396 return SDValue();
12397 SrcLane = Lane;
12398 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12399 SubLaneMask[Elt] = LocalM;
12400 }
12402 // Whole sub-lane is UNDEF.
12403 if (SrcLane < 0)
12404 continue;
12406 // Attempt to match against the candidate repeated sub-lane masks.
12407 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12408 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12409 for (int i = 0; i != NumSubLaneElts; ++i) {
12410 if (M1[i] < 0 || M2[i] < 0)
12411 continue;
12412 if (M1[i] != M2[i])
12413 return false;
12414 }
12415 return true;
12416 };
12418 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12419 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12420 continue;
12422 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12423 for (int i = 0; i != NumSubLaneElts; ++i) {
12424 int M = SubLaneMask[i];
12425 if (M < 0)
12426 continue;
12427 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12428 "Unexpected mask element");
12429 RepeatedSubLaneMask[i] = M;
12430 }
12432 // Track the top most source sub-lane - by setting the remaining to UNDEF
12433 // we can greatly simplify shuffle matching.
12434 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12435 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12436 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12437 break;
12438 }
12440 // Bail if we failed to find a matching repeated sub-lane mask.
12441 if (Dst2SrcSubLanes[DstSubLane] < 0)
12442 return SDValue();
12443 }
12444 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12445 "Unexpected source lane");
12447 // Create a repeating shuffle mask for the entire vector.
12448 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12449 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12450 int Lane = SubLane / SubLaneScale;
12451 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12452 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12453 int M = RepeatedSubLaneMask[Elt];
12454 if (M < 0)
12455 continue;
12456 int Idx = (SubLane * NumSubLaneElts) + Elt;
12457 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12458 }
12459 }
12460 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12462 // Shuffle each source sub-lane to its destination.
12463 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12464 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12465 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12466 if (SrcSubLane < 0)
12467 continue;
12468 for (int j = 0; j != NumSubLaneElts; ++j)
12469 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12470 }
12472 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12473 SubLaneMask);
12474 }
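/// Try to match a shuffle mask against the immediate-controlled SHUFPD
/// pattern; on success the immediate is written to ShuffleImm, and V1/V2 may
/// be swapped so that the mask fits the non-commuted form.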
12476 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12477 unsigned &ShuffleImm,
12478 ArrayRef<int> Mask) {
12479 int NumElts = VT.getVectorNumElements();
12480 assert(VT.getScalarSizeInBits() == 64 &&
12481 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12482 "Unexpected data type for VSHUFPD");
12484 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12485 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
12486 ShuffleImm = 0;
12487 bool ShufpdMask = true;
12488 bool CommutableMask = true;
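// VSHUFPD builds even result elements from V1 and odd ones from V2, choosing
// the low or high element of each pair of 64-bit elements with one immediate
// bit per result element; CommutVal tests the same pattern with the operands
// swapped.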
12489 for (int i = 0; i < NumElts; ++i) {
12490 if (Mask[i] == SM_SentinelUndef)
12491 continue;
12494 int Val = (i & 6) + NumElts * (i & 1);
12495 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12496 if (Mask[i] < Val || Mask[i] > Val + 1)
12497 ShufpdMask = false;
12498 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12499 CommutableMask = false;
12500 ShuffleImm |= (Mask[i] % 2) << i;
12501 }
12503 if (ShufpdMask)
12504 return true;
12505 if (CommutableMask) {
12506 std::swap(V1, V2);
12507 return true;
12508 }
12510 return false;
12511 }
12513 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12514 ArrayRef<int> Mask, SDValue V1,
12515 SDValue V2, SelectionDAG &DAG) {
12516 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12517 "Unexpected data type for VSHUFPD");
12519 unsigned Immediate = 0;
12520 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12521 return SDValue();
12523 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12524 DAG.getConstant(Immediate, DL, MVT::i8));
12525 }
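/// Lower a shuffle by building a constant mask vector and emitting a single
/// variable permute: VPERMV for one input, VPERMV3 when both inputs are used.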
12527 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12528 ArrayRef<int> Mask, SDValue V1,
12529 SDValue V2, SelectionDAG &DAG) {
12530 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12531 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12533 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12534 if (V2.isUndef())
12535 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12537 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12538 }
12540 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12542 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12543 /// isn't available.
12544 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12545 const APInt &Zeroable,
12546 SDValue V1, SDValue V2,
12547 const X86Subtarget &Subtarget,
12548 SelectionDAG &DAG) {
12549 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12550 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12551 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12553 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12554 Zeroable, Subtarget, DAG))
12555 return V;
12557 if (V2.isUndef()) {
12558 // Check for being able to broadcast a single element.
12559 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12560 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12561 return Broadcast;
12563 // Use low duplicate instructions for masks that match their pattern.
12564 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12565 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12567 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12568 // Non-half-crossing single input shuffles can be lowered with an
12569 // interleaved permutation.
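// Bit i of the immediate is set when result element i should take the high
// element of its 128-bit lane.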
12570 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12571 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12572 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12573 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12574 }
12576 // With AVX2 we have direct support for this permutation.
12577 if (Subtarget.hasAVX2())
12578 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12579 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12581 // Try to create an in-lane repeating shuffle mask and then shuffle the
12582 // results into the target lanes.
12583 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12584 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12585 return V;
12587 // Otherwise, fall back.
12588 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12589 DAG);
12590 }
12592 // Use dedicated unpack instructions for masks that match their pattern.
12593 if (SDValue V =
12594 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12595 return V;
12597 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12598 Zeroable, Subtarget, DAG))
12599 return Blend;
12601 // Check if the blend happens to exactly fit that of SHUFPD.
12602 if (SDValue Op =
12603 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12604 return Op;
12606 // Try to create an in-lane repeating shuffle mask and then shuffle the
12607 // results into the target lanes.
12608 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12609 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12610 return V;
12612 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12613 // shuffle. However, if we have AVX2 and either inputs are already in place,
12614 // we will be able to shuffle even across lanes the other input in a single
12615 // instruction so skip this pattern.
12616 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12617 isShuffleMaskInputInPlace(1, Mask))))
12618 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12619 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12620 return Result;
12621 // If we have VLX support, we can use VEXPAND.
12622 if (Subtarget.hasVLX())
12623 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12624 V1, V2, DAG, Subtarget))
12625 return V;
12627 // If we have AVX2 then we always want to lower with a blend because at v4 we
12628 // can fully permute the elements.
12629 if (Subtarget.hasAVX2())
12630 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12631 Mask, DAG);
12633 // Otherwise fall back on generic lowering.
12634 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12635 }
12637 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12639 /// This routine is only called when we have AVX2 and thus a reasonable
12640 /// instruction set for v4i64 shuffling.
12641 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12642 const APInt &Zeroable,
12643 SDValue V1, SDValue V2,
12644 const X86Subtarget &Subtarget,
12645 SelectionDAG &DAG) {
12646 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12647 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12648 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12649 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12651 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12652 Zeroable, Subtarget, DAG))
12653 return V;
12655 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12656 Zeroable, Subtarget, DAG))
12657 return Blend;
12659 // Check for being able to broadcast a single element.
12660 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12661 Mask, Subtarget, DAG))
12662 return Broadcast;
12664 if (V2.isUndef()) {
12665 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12666 // can use lower latency instructions that will operate on both lanes.
12667 SmallVector<int, 2> RepeatedMask;
12668 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12669 SmallVector<int, 4> PSHUFDMask;
12670 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12671 return DAG.getBitcast(
12672 MVT::v4i64,
12673 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12674 DAG.getBitcast(MVT::v8i32, V1),
12675 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12676 }
12678 // AVX2 provides a direct instruction for permuting a single input across
12679 // lanes.
12680 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12681 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12682 }
12684 // Try to use shift instructions.
12685 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12686 Zeroable, Subtarget, DAG))
12687 return Shift;
12689 // If we have VLX support, we can use VALIGN or VEXPAND.
12690 if (Subtarget.hasVLX()) {
12691 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12692 Mask, Subtarget, DAG))
12693 return Rotate;
12695 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12696 V1, V2, DAG, Subtarget))
12697 return V;
12698 }
12700 // Try to use PALIGNR.
12701 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12702 Mask, Subtarget, DAG))
12703 return Rotate;
12705 // Use dedicated unpack instructions for masks that match their pattern.
12706 if (SDValue V =
12707 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12708 return V;
12710 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12711 // shuffle. However, if we have AVX2 and either inputs are already in place,
12712 // we will be able to shuffle even across lanes the other input in a single
12713 // instruction so skip this pattern.
12714 if (!isShuffleMaskInputInPlace(0, Mask) &&
12715 !isShuffleMaskInputInPlace(1, Mask))
12716 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12717 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12718 return Result;
12720 // Otherwise fall back on generic blend lowering.
12721 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12722 Mask, DAG);
12723 }
12725 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12727 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12728 /// isn't available.
12729 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12730 const APInt &Zeroable,
12731 SDValue V1, SDValue V2,
12732 const X86Subtarget &Subtarget,
12733 SelectionDAG &DAG) {
12734 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12735 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12736 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12738 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12739 Zeroable, Subtarget, DAG))
12740 return Blend;
12742 // Check for being able to broadcast a single element.
12743 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12744 Mask, Subtarget, DAG))
12745 return Broadcast;
12747 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12748 // options to efficiently lower the shuffle.
12749 SmallVector<int, 4> RepeatedMask;
12750 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12751 assert(RepeatedMask.size() == 4 &&
12752 "Repeated masks must be half the mask width!");
12754 // Use even/odd duplicate instructions for masks that match their pattern.
12755 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12756 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12757 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12758 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12760 if (V2.isUndef())
12761 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12762 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12764 // Use dedicated unpack instructions for masks that match their pattern.
12765 if (SDValue V =
12766 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12767 return V;
12769 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12770 // have already handled any direct blends.
12771 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12772 }
12774 // Try to create an in-lane repeating shuffle mask and then shuffle the
12775 // results into the target lanes.
12776 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12777 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12778 return V;
12780 // If we have a single input shuffle with different shuffle patterns in the
12781 // two 128-bit lanes use the variable mask to VPERMILPS.
12782 if (V2.isUndef()) {
12783 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12784 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12785 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12787 if (Subtarget.hasAVX2())
12788 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12790 // Otherwise, fall back.
12791 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12792 DAG);
12793 }
12795 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12796 // shuffle.
12797 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12798 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12799 return Result;
12800 // If we have VLX support, we can use VEXPAND.
12801 if (Subtarget.hasVLX())
12802 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12803 V1, V2, DAG, Subtarget))
12804 return V;
12806 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12807 // since after split we get a more efficient code using vpunpcklwd and
12808 // vpunpckhwd instrs than vblend.
12809 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12810 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12811 Mask, DAG))
12812 return V;
12814 // If we have AVX2 then we always want to lower with a blend because at v8 we
12815 // can fully permute the elements.
12816 if (Subtarget.hasAVX2())
12817 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12818 Mask, DAG);
12820 // Otherwise fall back on generic lowering.
12821 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12822 }
12824 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12826 /// This routine is only called when we have AVX2 and thus a reasonable
12827 /// instruction set for v8i32 shuffling.
12828 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12829 const APInt &Zeroable,
12830 SDValue V1, SDValue V2,
12831 const X86Subtarget &Subtarget,
12832 SelectionDAG &DAG) {
12833 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12834 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12835 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12836 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12838 // Whenever we can lower this as a zext, that instruction is strictly faster
12839 // than any alternative. It also allows us to fold memory operands into the
12840 // shuffle in many cases.
12841 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12842 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12845 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12846 // since after split we get a more efficient code than vblend by using
12847 // vpunpcklwd and vpunpckhwd instrs.
12848 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12849 !Subtarget.hasAVX512())
12850 if (SDValue V =
12851 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12852 return V;
12854 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12855 Zeroable, Subtarget, DAG))
12858 // Check for being able to broadcast a single element.
12859 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12860 Mask, Subtarget, DAG))
12863 // If the shuffle mask is repeated in each 128-bit lane we can use more
12864 // efficient instructions that mirror the shuffles across the two 128-bit
12865 // lanes.
12866 SmallVector<int, 4> RepeatedMask;
12867 bool Is128BitLaneRepeatedShuffle =
12868 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12869 if (Is128BitLaneRepeatedShuffle) {
12870 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12871 if (V2.isUndef())
12872 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12873 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12875 // Use dedicated unpack instructions for masks that match their pattern.
12876 if (SDValue V =
12877 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12878 return V;
12879 }
12881 // Try to use shift instructions.
12882 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12883 Zeroable, Subtarget, DAG))
12886 // If we have VLX support, we can use VALIGN or EXPAND.
12887 if (Subtarget.hasVLX()) {
12888 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12889 Mask, Subtarget, DAG))
12892 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12893 V1, V2, DAG, Subtarget))
12897 // Try to use byte rotation instructions.
12898 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12899 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12902 // Try to create an in-lane repeating shuffle mask and then shuffle the
12903 // results into the target lanes.
12904 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12905 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12908 // If the shuffle patterns aren't repeated but it is a single input, directly
12909 // generate a cross-lane VPERMD instruction.
12910 if (V2.isUndef()) {
12911 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12912 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12913 }
12915 // Assume that a single SHUFPS is faster than an alternative sequence of
12916 // multiple instructions (even if the CPU has a domain penalty).
12917 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12918 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12919 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12920 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12921 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12922 CastV1, CastV2, DAG);
12923 return DAG.getBitcast(MVT::v8i32, ShufPS);
12924 }
12926 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12927 // shuffle.
12928 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12929 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12932 // Otherwise fall back on generic blend lowering.
12933 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12934 Mask, DAG);
12935 }
12937 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12939 /// This routine is only called when we have AVX2 and thus a reasonable
12940 /// instruction set for v16i16 shuffling.
12941 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12942 const APInt &Zeroable,
12943 SDValue V1, SDValue V2,
12944 const X86Subtarget &Subtarget,
12945 SelectionDAG &DAG) {
12946 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12947 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12948 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12949 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12951 // Whenever we can lower this as a zext, that instruction is strictly faster
12952 // than any alternative. It also allows us to fold memory operands into the
12953 // shuffle in many cases.
12954 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12955 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12958 // Check for being able to broadcast a single element.
12959 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12960 Mask, Subtarget, DAG))
12963 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12964 Zeroable, Subtarget, DAG))
12967 // Use dedicated unpack instructions for masks that match their pattern.
12968 if (SDValue V =
12969 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12970 return V;
12972 // Try to use shift instructions.
12973 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12974 Zeroable, Subtarget, DAG))
12977 // Try to use byte rotation instructions.
12978 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12979 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12982 // Try to create an in-lane repeating shuffle mask and then shuffle the
12983 // results into the target lanes.
12984 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12985 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12988 if (V2.isUndef()) {
12989 // There are no generalized cross-lane shuffle operations available on i16
12990 // element types.
12991 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12992 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12993 Mask, DAG);
12995 SmallVector<int, 8> RepeatedMask;
12996 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12997 // As this is a single-input shuffle, the repeated mask should be
12998 // a strictly valid v8i16 mask that we can pass through to the v8i16
12999 // lowering to handle even the v16 case.
13000 return lowerV8I16GeneralSingleInputVectorShuffle(
13001 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13002 }
13003 }
13005 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13006 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13007 return PSHUFB;
13009 // AVX512BWVL can lower to VPERMW.
13010 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13011 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13013 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13014 // shuffle.
13015 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13016 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13019 // Otherwise fall back on generic lowering.
13020 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13021 }
13023 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13025 /// This routine is only called when we have AVX2 and thus a reasonable
13026 /// instruction set for v32i8 shuffling.
13027 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13028 const APInt &Zeroable,
13029 SDValue V1, SDValue V2,
13030 const X86Subtarget &Subtarget,
13031 SelectionDAG &DAG) {
13032 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13033 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13034 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13035 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13037 // Whenever we can lower this as a zext, that instruction is strictly faster
13038 // than any alternative. It also allows us to fold memory operands into the
13039 // shuffle in many cases.
13040 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13041 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13044 // Check for being able to broadcast a single element.
13045 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13046 Mask, Subtarget, DAG))
13049 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13050 Zeroable, Subtarget, DAG))
13053 // Use dedicated unpack instructions for masks that match their pattern.
13054 if (SDValue V =
13055 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13056 return V;
13058 // Try to use shift instructions.
13059 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13060 Zeroable, Subtarget, DAG))
13063 // Try to use byte rotation instructions.
13064 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13065 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13068 // Try to create an in-lane repeating shuffle mask and then shuffle the
13069 // results into the target lanes.
13070 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13071 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13074 // There are no generalized cross-lane shuffle operations available on i8
13075 // element types.
13076 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13077 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13078 DAG);
13080 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13081 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13082 return PSHUFB;
13084 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13085 // shuffle.
13086 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13087 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13090 // Otherwise fall back on generic lowering.
13091 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13092 }
13094 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13096 /// This routine either breaks down the specific type of a 256-bit x86 vector
13097 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13098 /// together based on the available instructions.
13099 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13100 MVT VT, SDValue V1, SDValue V2,
13101 const APInt &Zeroable,
13102 const X86Subtarget &Subtarget,
13103 SelectionDAG &DAG) {
13104 // If we have a single input to the zero element, insert that into V1 if we
13105 // can do so cheaply.
13106 int NumElts = VT.getVectorNumElements();
13107 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13109 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13110 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13111 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13114 // Handle special cases where the lower or upper half is UNDEF.
13115 if (SDValue V =
13116 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13117 return V;
13119 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13120 // can check for those subtargets here and avoid much of the subtarget
13121 // querying in the per-vector-type lowering routines. With AVX1 we have
13122 // essentially *zero* ability to manipulate a 256-bit vector with integer
13123 // types. Since we'll use floating point types there eventually, just
13124 // immediately cast everything to a float and operate entirely in that domain.
13125 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13126 int ElementBits = VT.getScalarSizeInBits();
13127 if (ElementBits < 32) {
13128 // No floating point type available, if we can't use the bit operations
13129 // for masking/blending then decompose into 128-bit vectors.
13130 if (SDValue V =
13131 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13132 return V;
13133 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13134 return V;
13135 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13136 }
13138 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13139 VT.getVectorNumElements());
13140 V1 = DAG.getBitcast(FpVT, V1);
13141 V2 = DAG.getBitcast(FpVT, V2);
13142 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13143 }
13145 switch (VT.SimpleTy) {
13146 case MVT::v4f64:
13147 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13148 case MVT::v4i64:
13149 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13150 case MVT::v8f32:
13151 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13152 case MVT::v8i32:
13153 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13154 case MVT::v16i16:
13155 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13156 case MVT::v32i8:
13157 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13159 default:
13160 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13161 }
13162 }
13164 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
13165 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13166 ArrayRef<int> Mask, SDValue V1,
13167 SDValue V2, SelectionDAG &DAG) {
13168 assert(VT.getScalarSizeInBits() == 64 &&
13169 "Unexpected element type size for 128bit shuffle.");
13171 // Handling 256-bit vectors here would require VLX, and
13172 // lowerV2X128VectorShuffle() is most probably the better solution for them.
13173 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13175 SmallVector<int, 4> WidenedMask;
13176 if (!canWidenShuffleElements(Mask, WidenedMask))
13177 return SDValue();
13179 // Check for patterns which can be matched with a single insert of a 256-bit
13180 // subvector.
13181 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13182 {0, 1, 2, 3, 0, 1, 2, 3});
13183 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13184 {0, 1, 2, 3, 8, 9, 10, 11})) {
13185 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13186 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13187 DAG.getIntPtrConstant(0, DL));
13188 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13189 OnlyUsesV1 ? V1 : V2,
13190 DAG.getIntPtrConstant(0, DL));
13191 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13192 }
13194 assert(WidenedMask.size() == 4);
13196 // See if this is an insertion of the lower 128-bits of V2 into V1.
13197 bool IsInsert = true;
13198 int V2Index = -1;
13199 for (int i = 0; i < 4; ++i) {
13200 assert(WidenedMask[i] >= -1);
13201 if (WidenedMask[i] < 0)
13202 continue;
13204 // Make sure all V1 subvectors are in place.
13205 if (WidenedMask[i] < 4) {
13206 if (WidenedMask[i] != i) {
13207 IsInsert = false;
13208 break;
13209 }
13210 } else {
13211 // Make sure we only have a single V2 index and it's the lowest 128-bits.
13212 if (V2Index >= 0 || WidenedMask[i] != 4) {
13213 IsInsert = false;
13214 break;
13215 }
13216 V2Index = i;
13217 }
13218 }
13219 if (IsInsert && V2Index >= 0) {
13220 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13221 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13222 DAG.getIntPtrConstant(0, DL));
13223 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13224 }
13226 // Try to lower to vshuf64x2/vshuf32x4.
13227 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13228 unsigned PermMask = 0;
13229 // Ensure elements came from the same Op.
13230 for (int i = 0; i < 4; ++i) {
13231 assert(WidenedMask[i] >= -1);
13232 if (WidenedMask[i] < 0)
13233 continue;
13235 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13236 unsigned OpIndex = i / 2;
13237 if (Ops[OpIndex].isUndef())
13238 Ops[OpIndex] = Op;
13239 else if (Ops[OpIndex] != Op)
13240 return SDValue();
13242 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13243 // bits defined by a vshuf64x2 instruction's immediate control byte.
13244 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13245 }
13247 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13248 DAG.getConstant(PermMask, DL, MVT::i8));
13249 }
13251 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13252 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13253 const APInt &Zeroable,
13254 SDValue V1, SDValue V2,
13255 const X86Subtarget &Subtarget,
13256 SelectionDAG &DAG) {
13257 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13258 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13259 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13261 if (V2.isUndef()) {
13262 // Use low duplicate instructions for masks that match their pattern.
13263 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13264 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13266 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13267 // Non-half-crossing single input shuffles can be lowered with an
13268 // interleaved permutation.
13269 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13270 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13271 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13272 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13273 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13274 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13275 }
13277 SmallVector<int, 4> RepeatedMask;
13278 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13279 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13280 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13281 }
13283 if (SDValue Shuf128 =
13284 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13285 return Shuf128;
13287 if (SDValue Unpck =
13288 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13289 return Unpck;
13291 // Check if the blend happens to exactly fit that of SHUFPD.
13292 if (SDValue Op =
13293 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13294 return Op;
13296 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13297 V2, DAG, Subtarget))
13298 return V;
13300 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13301 Zeroable, Subtarget, DAG))
13302 return Blend;
13304 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13305 }
13307 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13308 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13309 const APInt &Zeroable,
13310 SDValue V1, SDValue V2,
13311 const X86Subtarget &Subtarget,
13312 SelectionDAG &DAG) {
13313 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13314 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13315 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13317 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13318 // options to efficiently lower the shuffle.
13319 SmallVector<int, 4> RepeatedMask;
13320 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13321 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13323 // Use even/odd duplicate instructions for masks that match their pattern.
13324 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13325 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13326 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13327 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13329 if (V2.isUndef())
13330 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13331 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13333 // Use dedicated unpack instructions for masks that match their pattern.
13334 if (SDValue Unpck =
13335 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13336 return Unpck;
13338 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13339 Zeroable, Subtarget, DAG))
13340 return Blend;
13342 // Otherwise, fall back to a SHUFPS sequence.
13343 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13344 }
13345 // If we have AVX512F support, we can use VEXPAND.
13346 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13347 V1, V2, DAG, Subtarget))
13348 return V;
13350 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13351 }
13353 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13354 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13355 const APInt &Zeroable,
13356 SDValue V1, SDValue V2,
13357 const X86Subtarget &Subtarget,
13358 SelectionDAG &DAG) {
13359 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13360 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13361 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13363 if (SDValue Shuf128 =
13364 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13365 return Shuf128;
13367 if (V2.isUndef()) {
13368 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13369 // can use lower latency instructions that will operate on all four
13370 // 128-bit lanes.
13371 SmallVector<int, 2> Repeated128Mask;
13372 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13373 SmallVector<int, 4> PSHUFDMask;
13374 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13375 return DAG.getBitcast(
13376 MVT::v8i64,
13377 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13378 DAG.getBitcast(MVT::v16i32, V1),
13379 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13380 }
13382 SmallVector<int, 4> Repeated256Mask;
13383 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13384 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13385 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13386 }
13388 // Try to use shift instructions.
13389 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13390 Zeroable, Subtarget, DAG))
13393 // Try to use VALIGN.
13394 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13395 Mask, Subtarget, DAG))
13398 // Try to use PALIGNR.
13399 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13400 Mask, Subtarget, DAG))
13403 if (SDValue Unpck =
13404 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13405 return Unpck;
13406 // If we have AVX512F support, we can use VEXPAND.
13407 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13408 V2, DAG, Subtarget))
13411 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13412 Zeroable, Subtarget, DAG))
13415 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13416 }
13418 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13419 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13420 const APInt &Zeroable,
13421 SDValue V1, SDValue V2,
13422 const X86Subtarget &Subtarget,
13423 SelectionDAG &DAG) {
13424 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13425 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13428 // Whenever we can lower this as a zext, that instruction is strictly faster
13429 // than any alternative. It also allows us to fold memory operands into the
13430 // shuffle in many cases.
13431 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13432 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13435 // If the shuffle mask is repeated in each 128-bit lane we can use more
13436 // efficient instructions that mirror the shuffles across the four 128-bit
13437 // lanes.
13438 SmallVector<int, 4> RepeatedMask;
13439 bool Is128BitLaneRepeatedShuffle =
13440 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13441 if (Is128BitLaneRepeatedShuffle) {
13442 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13443 if (V2.isUndef())
13444 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13447 // Use dedicated unpack instructions for masks that match their pattern.
13448 if (SDValue V =
13449 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13450 return V;
13451 }
13453 // Try to use shift instructions.
13454 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13455 Zeroable, Subtarget, DAG))
13458 // Try to use VALIGN.
13459 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13460 Mask, Subtarget, DAG))
13463 // Try to use byte rotation instructions.
13464 if (Subtarget.hasBWI())
13465 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13466 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13469 // Assume that a single SHUFPS is faster than using a permv shuffle.
13470 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13471 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13472 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13473 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13474 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13475 CastV1, CastV2, DAG);
13476 return DAG.getBitcast(MVT::v16i32, ShufPS);
13477 }
13478 // If we have AVX512F support, we can use VEXPAND.
13479 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13480 V1, V2, DAG, Subtarget))
13483 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13484 Zeroable, Subtarget, DAG))
13485 return Blend;
13486 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13487 }
13489 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13490 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13491 const APInt &Zeroable,
13492 SDValue V1, SDValue V2,
13493 const X86Subtarget &Subtarget,
13494 SelectionDAG &DAG) {
13495 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13496 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13497 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13498 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13500 // Whenever we can lower this as a zext, that instruction is strictly faster
13501 // than any alternative. It also allows us to fold memory operands into the
13502 // shuffle in many cases.
13503 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13504 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13507 // Use dedicated unpack instructions for masks that match their pattern.
13508 if (SDValue V =
13509 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13510 return V;
13512 // Try to use shift instructions.
13513 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13514 Zeroable, Subtarget, DAG))
13517 // Try to use byte rotation instructions.
13518 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13519 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13522 if (V2.isUndef()) {
13523 SmallVector<int, 8> RepeatedMask;
13524 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13525 // As this is a single-input shuffle, the repeated mask should be
13526 // a strictly valid v8i16 mask that we can pass through to the v8i16
13527 // lowering to handle even the v32 case.
13528 return lowerV8I16GeneralSingleInputVectorShuffle(
13529 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13530 }
13531 }
13533 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13534 Zeroable, Subtarget, DAG))
13535 return Blend;
13537 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13538 }
13540 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13541 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13542 const APInt &Zeroable,
13543 SDValue V1, SDValue V2,
13544 const X86Subtarget &Subtarget,
13545 SelectionDAG &DAG) {
13546 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13547 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13548 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13549 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13551 // Whenever we can lower this as a zext, that instruction is strictly faster
13552 // than any alternative. It also allows us to fold memory operands into the
13553 // shuffle in many cases.
13554 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13555 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13558 // Use dedicated unpack instructions for masks that match their pattern.
13559 if (SDValue V =
13560 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13561 return V;
13563 // Try to use shift instructions.
13564 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13565 Zeroable, Subtarget, DAG))
13568 // Try to use byte rotation instructions.
13569 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13570 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13573 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13574 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13575 return PSHUFB;
13577 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13578 if (Subtarget.hasVBMI())
13579 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13581 // Try to create an in-lane repeating shuffle mask and then shuffle the
13582 // results into the target lanes.
13583 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13584 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13585 return V;
13587 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13588 Zeroable, Subtarget, DAG))
13589 return Blend;
13591 // FIXME: Implement direct support for this type!
13592 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13593 }
13595 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13597 /// This routine either breaks down the specific type of a 512-bit x86 vector
13598 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13599 /// together based on the available instructions.
13600 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13601 MVT VT, SDValue V1, SDValue V2,
13602 const APInt &Zeroable,
13603 const X86Subtarget &Subtarget,
13604 SelectionDAG &DAG) {
13605 assert(Subtarget.hasAVX512() &&
13606 "Cannot lower 512-bit vectors w/ basic ISA!");
13608 // If we have a single input to the zero element, insert that into V1 if we
13609 // can do so cheaply.
13610 int NumElts = Mask.size();
13611 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13613 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13614 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13615 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13616 return Insertion;
13618 // Check for being able to broadcast a single element.
13619 if (SDValue Broadcast =
13620 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13621 return Broadcast;
13623 // Dispatch to each element type for lowering. If we don't have support for
13624 // specific element type shuffles at 512 bits, immediately split them and
13625 // lower them. Each lowering routine of a given type is allowed to assume that
13626 // the requisite ISA extensions for that element type are available.
13627 switch (VT.SimpleTy) {
13628 case MVT::v8f64:
13629 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13630 case MVT::v16f32:
13631 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13632 case MVT::v8i64:
13633 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13634 case MVT::v16i32:
13635 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13636 case MVT::v32i16:
13637 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13638 case MVT::v64i8:
13639 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13641 default:
13642 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13643 }
13644 }
13646 // Lower vXi1 vector shuffles.
13647 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13648 // The only way to shuffle bits is to sign-extend the mask to a SIMD vector,
13649 // shuffle and then truncate it back.
13650 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13651 MVT VT, SDValue V1, SDValue V2,
13652 const X86Subtarget &Subtarget,
13653 SelectionDAG &DAG) {
13654 assert(Subtarget.hasAVX512() &&
13655 "Cannot lower 512-bit vectors w/o basic ISA!");
13657 switch (VT.SimpleTy) {
13659 llvm_unreachable("Expected a vector of i1 elements");
13661 ExtVT = MVT::v2i64;
13664 ExtVT = MVT::v4i32;
13667 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13670 ExtVT = MVT::v16i32;
13673 ExtVT = MVT::v32i16;
13676 ExtVT = MVT::v64i8;
13680 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13681 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13682 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13683 V1 = getOnesVector(ExtVT, DAG, DL);
13684 else
13685 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13687 if (V2.isUndef())
13688 V2 = DAG.getUNDEF(ExtVT);
13689 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13690 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13691 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13692 V2 = getOnesVector(ExtVT, DAG, DL);
13693 else
13694 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13696 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13697 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13698 int NumElems = VT.getVectorNumElements();
13699 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13700 (Subtarget.hasDQI() && (NumElems < 32)))
13701 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13703 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13704 }
13706 /// Helper function that returns true if the shuffle mask should be
13707 /// commuted to improve canonicalization.
13708 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13709 int NumElements = Mask.size();
13711 int NumV1Elements = 0, NumV2Elements = 0;
13712 for (int M : Mask)
13713 if (M < 0)
13714 continue;
13715 else if (M < NumElements)
13716 ++NumV1Elements;
13717 else
13718 ++NumV2Elements;
13720 // Commute the shuffle as needed such that more elements come from V1 than
13721 // V2. This allows us to match the shuffle pattern strictly on how many
13722 // elements come from V1 without handling the symmetric cases.
13723 if (NumV2Elements > NumV1Elements)
13724 return true;
13726 assert(NumV1Elements > 0 && "No V1 indices");
13728 if (NumV2Elements == 0)
13729 return false;
13731 // When the number of V1 and V2 elements are the same, try to minimize the
13732 // number of uses of V2 in the low half of the vector. When that is tied,
13733 // ensure that the sum of indices for V1 is equal to or lower than the sum
13734 // of indices for V2. When those are equal, try to ensure that the number of odd
13735 // indices for V1 is lower than the number of odd indices for V2.
13736 if (NumV1Elements == NumV2Elements) {
13737 int LowV1Elements = 0, LowV2Elements = 0;
13738 for (int M : Mask.slice(0, NumElements / 2))
13739 if (M >= NumElements)
13740 ++LowV2Elements;
13741 else if (M >= 0)
13742 ++LowV1Elements;
13743 if (LowV2Elements > LowV1Elements)
13744 return true;
13745 if (LowV2Elements == LowV1Elements) {
13746 int SumV1Indices = 0, SumV2Indices = 0;
13747 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13748 if (Mask[i] >= NumElements)
13749 SumV2Indices += i;
13750 else if (Mask[i] >= 0)
13751 SumV1Indices += i;
13752 if (SumV2Indices < SumV1Indices)
13753 return true;
13754 if (SumV2Indices == SumV1Indices) {
13755 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13756 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13757 if (Mask[i] >= NumElements)
13758 NumV2OddIndices += i % 2;
13759 else if (Mask[i] >= 0)
13760 NumV1OddIndices += i % 2;
13761 if (NumV2OddIndices < NumV1OddIndices)
13762 return true;
13763 }
13764 }
13765 }
13767 return false;
13768 }
13770 /// \brief Top-level lowering for x86 vector shuffles.
13772 /// This handles decomposition, canonicalization, and lowering of all x86
13773 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13774 /// above in helper routines. The canonicalization attempts to widen shuffles
13775 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13776 /// s.t. only one of the two inputs needs to be tested, etc.
13777 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13778 SelectionDAG &DAG) {
13779 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13780 ArrayRef<int> Mask = SVOp->getMask();
13781 SDValue V1 = Op.getOperand(0);
13782 SDValue V2 = Op.getOperand(1);
13783 MVT VT = Op.getSimpleValueType();
13784 int NumElements = VT.getVectorNumElements();
13785 SDLoc DL(Op);
13786 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13788 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13789 "Can't lower MMX shuffles");
13791 bool V1IsUndef = V1.isUndef();
13792 bool V2IsUndef = V2.isUndef();
13793 if (V1IsUndef && V2IsUndef)
13794 return DAG.getUNDEF(VT);
  // When we create a shuffle node we put the UNDEF node as the second operand,
  // but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);
  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }

13815 // Check for illegal shuffle mask element index values.
13816 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13817 assert(llvm::all_of(Mask,
13818 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13819 "Out of bounds shuffle index");
13821 // We actually see shuffles that are entirely re-arrangements of a set of
13822 // zero inputs. This mostly happens while decomposing complex shuffles into
13823 // simple ones. Directly lower these as a buildvector of zeros.
13824 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13825 if (Zeroable.isAllOnesValue())
13826 return getZeroVector(VT, Subtarget, DAG, DL);
13828 // Try to collapse shuffles into using a vector type with fewer elements but
13829 // wider element types. We cap this to not form integers or floating point
13830 // elements wider than 64 bits, but it might be interesting to form i128
13831 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
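  // Illustrative example (added annotation, not part of the original comment):
  // a v4i32 shuffle with mask <0, 1, 6, 7> moves whole 64-bit pairs, so it can
  // be widened to a v2i64 shuffle with mask <0, 3>, which is cheaper to match.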
13832 SmallVector<int, 16> WidenedMask;
13833 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13834 canWidenShuffleElements(Mask, WidenedMask)) {
13835 MVT NewEltVT = VT.isFloatingPoint()
13836 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13837 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13838 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

13849 // Commute the shuffle if it will improve canonicalization.
13850 if (canonicalizeShuffleMaskWithCommute(Mask))
13851 return DAG.getCommutedVectorShuffle(*SVOp);
  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
13872 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13873 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13874 const X86Subtarget &Subtarget,
13875 SelectionDAG &DAG) {
13876 SDValue Cond = Op.getOperand(0);
13877 SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

13886 // Only non-legal VSELECTs reach this lowering, convert those into generic
13887 // shuffles and re-use the shuffle lowering path for blends.
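  // Illustrative example (added annotation, not from the original source): for
  //   vselect <4 x i1> <1, 0, 1, 0>, %LHS, %RHS
  // the constant condition becomes the shuffle mask <0, 5, 2, 7>, where
  // indices below 4 pick lanes of LHS and indices of 4 or more pick lanes of
  // RHS; non-constant condition elements become undef (-1) lanes.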
13888 SmallVector<int, 32> Mask;
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    Mask.push_back(
        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
                                     : -1);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
13898 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13899 // A vselect where all conditions and data are constants can be optimized into
13900 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13901 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13902 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
  // a null value.
  switch (Op.getSimpleValueType().SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8: {
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();
  }

  case MVT::v8i16:
  case MVT::v16i16:
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
}
13942 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

13949 if (VT.getSizeInBits() == 8) {
13950 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13951 Op.getOperand(0), Op.getOperand(1));
13952 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13953 DAG.getValueType(VT));
13954 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13957 if (VT == MVT::f32) {
13958 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13959 // the result back to FR32 register. It's only worth matching if the
13960 // result has a single use which is a store or a bitcast to i32. And in
13961 // the case of a store, it's not worth it if the index is a constant 0,
13962 // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
13965 SDNode *User = *Op.getNode()->use_begin();
13966 if ((User->getOpcode() != ISD::STORE ||
13967 isNullConstant(Op.getOperand(1))) &&
13968 (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }
13977 if (VT == MVT::i32 || VT == MVT::i64) {
13978 // ExtractPS/pextrq works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}
13986 /// Extract one bit from mask vector, like v16i1 or v8i1.
13987 /// AVX-512 feature.
13989 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VecVT = Vec.getSimpleValueType();
13993 SDValue Idx = Op.getOperand(1);
13994 MVT EltVT = Op.getSimpleValueType();
13996 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13997 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13998 "Unexpected vector type in ExtractBitFromMaskVector");
14000 // variable index can't be handled in mask registers,
14001 // extend vector to VR512/128
14002 if (!isa<ConstantSDNode>(Idx)) {
14003 unsigned NumElts = VecVT.getVectorNumElements();
14004 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14005 // than extending to 128/256bit.
14006 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14007 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14008 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14009 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14010 ExtVT.getVectorElementType(), Ext, Idx);
14011 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14014 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14015 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14016 (VecVT.getVectorNumElements() < 8)) {
14017 // Use kshiftlw/rw instruction.
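    // Sketch of the shift sequence (illustrative annotation only): to extract
    // bit 3 of a v16i1 mask, KSHIFTL by 15 - 3 = 12 moves bit 3 into the MSB,
    // discarding everything above it, and KSHIFTR by 15 then brings it down to
    // bit 0, discarding everything below it, so lane 0 holds exactly the
    // wanted bit.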
14018 VecVT = MVT::v16i1;
14019 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
14024 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14025 if (MaxSift - IdxVal)
14026 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14027 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14028 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14029 DAG.getConstant(MaxSift, dl, MVT::i8));
14030 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
                     DAG.getIntPtrConstant(0, dl));
}

SDValue
14035 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14036 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14039 MVT VecVT = Vec.getSimpleValueType();
14040 SDValue Idx = Op.getOperand(1);
14042 if (Op.getSimpleValueType() == MVT::i1)
14043 return ExtractBitFromMaskVector(Op, DAG);
14045 if (!isa<ConstantSDNode>(Idx)) {
14046 // Its more profitable to go through memory (1 cycles throughput)
14047 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14048 // IACA tool was used to get performance estimation
14049 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14051 // example : extractelement <16 x i8> %a, i32 %i
14053 // Block Throughput: 3.00 Cycles
14054 // Throughput Bottleneck: Port5
14056 // | Num Of | Ports pressure in cycles | |
14057 // | Uops | 0 - DV | 5 | 6 | 7 | |
14058 // ---------------------------------------------
14059 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14060 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14061 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14062 // Total Num Of Uops: 4
14065 // Block Throughput: 1.00 Cycles
14066 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14068 // | | Ports pressure in cycles | |
14069 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14070 // ---------------------------------------------------------
14071 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14072 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14073 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }

14079 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14081 // If this is a 256-bit vector result, first extract the 128-bit vector and
14082 // then extract the element from the 128-bit vector.
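  // Illustrative example (added annotation, not from the original source):
  // extracting element 9 from a v16i16 (256-bit) vector first extracts the
  // upper 128-bit half (elements 8..15) and then extracts element 9 & 7 == 1
  // from that v8i16.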
14083 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14084 // Get the 128-bit vector.
14085 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14086 MVT EltVT = VecVT.getVectorElementType();
14088 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14089 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14091 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14092 // this can be done with a mask.
14093 IdxVal &= ElemsPerChunk - 1;
14094 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

14098 assert(VecVT.is128BitVector() && "Unexpected vector length");
14100 MVT VT = Op.getSimpleValueType();
14102 if (VT.getSizeInBits() == 16) {
14103 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14104 // we're going to zero extend the register or fold the store (SSE41 only).
14105 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14106 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14107 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14108 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14109 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14111 // Transform it so it match pextrw which produces a 32-bit result.
14112 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14113 Op.getOperand(0), Op.getOperand(1));
14114 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14115 DAG.getValueType(VT));
14116 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14127 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14128 int DWordIdx = IdxVal / 4;
14129 if (DWordIdx == 0) {
14130 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14131 DAG.getBitcast(MVT::v4i32, Vec),
14132 DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}
14181 /// Insert one bit to mask vector, like v16i1 or v8i1.
14182 /// AVX-512 feature.
14184 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14187 SDValue Elt = Op.getOperand(1);
14188 SDValue Idx = Op.getOperand(2);
14189 MVT VecVT = Vec.getSimpleValueType();
14191 if (!isa<ConstantSDNode>(Idx)) {
14192 // Non constant index. Extend source and destination,
14193 // insert element and then truncate the result.
14194 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14195 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14196 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14197 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14198 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14199 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14202 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14203 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14204 unsigned NumElems = VecVT.getVectorNumElements();
  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

14213 // Insertion of one bit into first or last position
14214 // can be done with two SHIFTs + OR.
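  // Sketch of the idea (illustrative annotation only): inserting bit b at
  // position 0 of mask k becomes ((k >> 1) << 1) | b, and inserting at the
  // last position NumElems-1 becomes ((k << 1) >> 1) | (b << (NumElems-1));
  // in both cases the two KSHIFTs clear the destination bit and the OR drops
  // the new bit in.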
14215 if (IdxVal == 0 ) {
14216 // EltInVec already at correct index and other bits are 0.
14217 // Clean the first bit in source vector.
14218 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14219 DAG.getConstant(1 , dl, MVT::i8));
14220 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14221 DAG.getConstant(1, dl, MVT::i8));
14223 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14225 if (IdxVal == NumElems -1) {
14226 // Move the bit to the last position inside the vector.
14227 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14228 DAG.getConstant(IdxVal, dl, MVT::i8));
14229 // Clean the last bit in the source vector.
14230 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14231 DAG.getConstant(1, dl, MVT::i8));
14232 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14233 DAG.getConstant(1 , dl, MVT::i8));
    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

14238 // Use shuffle to insert element.
14239 SmallVector<int, 64> MaskVec(NumElems);
14240 for (unsigned i = 0; i != NumElems; ++i)
14241 MaskVec[i] = (i == IdxVal) ? NumElems : i;
  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}

14246 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14247 SelectionDAG &DAG) const {
14248 MVT VT = Op.getSimpleValueType();
14249 MVT EltVT = VT.getVectorElementType();
14250 unsigned NumElts = VT.getVectorNumElements();
14252 if (EltVT == MVT::i1)
14253 return InsertBitToMaskVector(Op, DAG);
  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
14257 SDValue N1 = Op.getOperand(1);
14258 SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
14261 auto *N2C = cast<ConstantSDNode>(N2);
14262 unsigned IdxVal = N2C->getZExtValue();
14264 bool IsZeroElt = X86::isZeroNode(N1);
14265 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
  // If we are inserting an element, see if we can do this more efficiently with
  // a blend shuffle with a rematerializable vector than a costly integer
  // insertion.
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
14271 // be beneficial if we are inserting several zeros and can combine the masks.
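  // Illustrative example (added annotation, not from the original source):
  // inserting a zero into lane 2 of a v4i32 value X becomes a shuffle of X
  // with a zero vector using mask <0, 1, 6, 3>, i.e. lane 2 is taken from the
  // rematerializable zero vector instead of going through a GPR and a pinsrd.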
14272 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
14273 SmallVector<int, 8> BlendMask;
14274 for (unsigned i = 0; i != NumElts; ++i)
14275 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14276 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14277 : DAG.getConstant(-1, dl, VT);
14278 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14281 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14282 // into that, and then insert the subvector back into the result.
14283 if (VT.is256BitVector() || VT.is512BitVector()) {
14284 // With a 256-bit vector, we can insert into the zero element efficiently
14285 // using a blend if we have AVX or AVX2 and the right data type.
14286 if (VT.is256BitVector() && IdxVal == 0) {
14287 // TODO: It is worthwhile to cast integer to floating point and back
14288 // and incur a domain crossing penalty if that's what we'll end up
14289 // doing anyway after extracting to a 128-bit vector.
14290 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14291 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14292 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14293 N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

14298 // Get the desired 128-bit vector chunk.
14299 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14301 // Insert the element into the desired chunk.
14302 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14303 assert(isPowerOf2_32(NumEltsIn128));
14304 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14305 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14307 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14308 DAG.getConstant(IdxIn128, dl, MVT::i32));
14310 // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
14313 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14315 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14316 // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

14335 if (Subtarget.hasSSE41()) {
14336 if (EltVT == MVT::f32) {
14337 // Bits [7:6] of the constant are the source select. This will always be
14338 // zero here. The DAG Combiner may combine an extract_elt index into
14339 // these bits. For example (insert (extract, 3), 2) could be matched by
14340 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14341 // Bits [5:4] of the constant are the destination select. This is the
14342 // value of the incoming immediate.
14343 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14344 // combine either bitwise AND or insert of float 0.0 to set these bits.
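      // Worked example (illustrative annotation only): lowering
      //   (insert_elt V1, (extract_elt V2, 3), 2)
      // to INSERTPS would use the immediate (3 << 6) | (2 << 4) = 0xE0:
      // source select 3, destination select 2, zero mask 0.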
14346 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14347 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14348 // If this is an insertion of 32-bits into the low 32-bits of
14349 // a vector, we prefer to generate a blend with immediate rather
14350 // than an insertps. Blends are simpler operations in hardware and so
14351 // will always have equal or better performance than insertps.
14352 // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}

14373 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14374 SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();
14378 // It's always cheaper to replace a xor+movd with xorps and simplifies further
14380 if (X86::isZeroNode(Op.getOperand(0)))
14381 return getZeroVector(OpVT, Subtarget, DAG, dl);
14383 // If this is a 256-bit vector result, first insert into a 128-bit
14384 // vector and then insert into the 256-bit vector.
14385 if (!OpVT.is128BitVector()) {
14386 // Insert into a 128-bit vector.
14387 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14388 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14389 OpVT.getVectorNumElements() / SizeFactor);
14391 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14393 // Insert the 128-bit vector.
14394 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14396 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14398 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

14402 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14403 return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

14407 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14408 // a simple subregister reference or explicit instructions to grab
14409 // upper bits of a vector.
14410 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14411 SelectionDAG &DAG) {
14412 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
14416 SDValue Idx = Op.getOperand(1);
14417 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14418 MVT ResVT = Op.getSimpleValueType();
14420 assert((In.getSimpleValueType().is256BitVector() ||
14421 In.getSimpleValueType().is512BitVector()) &&
14422 "Can only extract from 256-bit or 512-bit vectors");
14424 // If the input is a buildvector just emit a smaller one.
14425 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14426 if (In.getOpcode() == ISD::BUILD_VECTOR)
14427 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT,
14428 makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
  // Everything else is legal.
  return Op;
}

14434 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14435 // simple superregister reference or explicit instructions to insert
14436 // the upper bits of a vector.
14437 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14438 SelectionDAG &DAG) {
14439 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
  return insert1BitVector(Op, DAG, Subtarget);
}

14444 // Returns the appropriate wrapper opcode for a global reference.
14445 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14446 // References to absolute symbols are never PC-relative.
14447 if (GV && GV->isAbsoluteSymbolRef())
14448 return X86ISD::Wrapper;
14450 CodeModel::Model M = getTargetMachine().getCodeModel();
14451 if (Subtarget.isPICStyleRIPRel() &&
14452 (M == CodeModel::Small || M == CodeModel::Kernel))
14453 return X86ISD::WrapperRIP;
  return X86ISD::Wrapper;
}

14458 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14459 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14460 // one of the above mentioned nodes. It has to be wrapped because otherwise
14461 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri or MOV64ri.
SDValue
14465 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14466 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14468 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14469 // global base reg.
14470 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14472 auto PtrVT = getPointerTy(DAG.getDataLayout());
14473 SDValue Result = DAG.getTargetConstantPool(
14474 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

14487 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14488 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14490 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14491 // global base reg.
14492 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14494 auto PtrVT = getPointerTy(DAG.getDataLayout());
14495 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
14509 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14510 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14512 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14513 // global base reg.
14514 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14515 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14517 auto PtrVT = getPointerTy(DAG.getDataLayout());
14518 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14523 // With PIC, the address is actually $g + Offset.
14524 if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
14540 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14541 // Create the TargetBlockAddressAddress node.
14542 unsigned char OpFlags =
14543 Subtarget.classifyBlockAddressReference();
14544 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14545 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
14548 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14549 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14551 // With PIC, the address is actually $g + Offset.
14552 if (isGlobalRelativeToPICBase(OpFlags)) {
14553 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

14560 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14561 const SDLoc &dl, int64_t Offset,
14562 SelectionDAG &DAG) const {
14563 // Create the TargetGlobalAddress node, folding in the constant
14564 // offset if it is legal.
14565 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14566 CodeModel::Model M = DAG.getTarget().getCodeModel();
14567 auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
14570 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14571 // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

14578 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14580 // With PIC, the address is actually $g + Offset.
14581 if (isGlobalRelativeToPICBase(OpFlags)) {
14582 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
14588 if (isGlobalStubReference(OpFlags))
14589 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14590 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14592 // If there was a non-zero offset that we didn't fold, create an explicit
14593 // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
14602 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14603 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14604 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
14609 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14610 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14611 unsigned char OperandFlags, bool LocalDynamic = false) {
14612 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14613 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

14631 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14632 MFI.setAdjustsStack(true);
14633 MFI.setHasCalls(true);
14635 SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

14639 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
14645 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14646 DAG.getNode(X86ISD::GlobalBaseReg,
14647 SDLoc(), PtrVT), InFlag);
14648 InFlag = Chain.getValue(1);
  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

14653 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

14667 // Get the start address of the TLS block for this module.
14668 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14669 .getInfo<X86MachineFunctionInfo>();
14670 MFI->incNumLocalDynamicTLSAccesses();
  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
14678 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14679 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14680 InFlag = Chain.getValue(1);
14681 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
14689 unsigned char OperandFlags = X86II::MO_DTPOFF;
14690 unsigned WrapperKind = X86ISD::Wrapper;
14691 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14692 GA->getValueType(0),
14693 GA->getOffset(), OperandFlags);
14694 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14696 // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

14700 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14701 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14702 const EVT PtrVT, TLSModel::Model model,
14703 bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14707 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14708 is64Bit ? 257 : 256));
14710 SDValue ThreadPointer =
14711 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14712 MachinePointerInfo(Ptr));
14714 unsigned char OperandFlags = 0;
14715 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14717 unsigned WrapperKind = X86ISD::Wrapper;
14718 if (model == TLSModel::LocalExec) {
14719 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

14731 // emit "addl x@ntpoff,%eax" (local exec)
14732 // or "addl x@indntpoff,%eax" (initial exec)
14733 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14736 GA->getOffset(), OperandFlags);
14737 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14739 if (model == TLSModel::InitialExec) {
14740 if (isPIC && !is64Bit) {
14741 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

14750 // The address of the thread local variable is the add of the thread
14751 // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
14756 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14758 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14760 if (DAG.getTarget().Options.EmulatedTLS)
14761 return LowerToTLSEmulatedModel(GA, DAG);
14763 const GlobalValue *GV = GA->getGlobal();
14764 auto PtrVT = getPointerTy(DAG.getDataLayout());
14765 bool PositionIndependent = isPositionIndependent();
14767 if (Subtarget.isTargetELF()) {
14768 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
14771 if (Subtarget.is64Bit())
14772 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14773 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14774 case TLSModel::LocalDynamic:
14775 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14776 Subtarget.is64Bit());
14777 case TLSModel::InitialExec:
14778 case TLSModel::LocalExec:
14779 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14780 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

14785 if (Subtarget.isTargetDarwin()) {
14786 // Darwin only has one model of TLS. Lower to that.
14787 unsigned char OpFlag = 0;
14788 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14789 X86ISD::WrapperRIP : X86ISD::Wrapper;
14791 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14792 // global base reg.
14793 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // place.
14812 SDValue Chain = DAG.getEntryNode();
14813 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14814 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14815 SDValue Args[] = { Chain, Offset };
14816 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14817 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14818 DAG.getIntPtrConstant(0, DL, true),
14819 Chain.getValue(1), DL);
14821 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14822 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14823 MFI.setAdjustsStack(true);
14825 // And our return value (tls address) is in the standard call return value
14827 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

14831 if (Subtarget.isTargetKnownWindowsMSVC() ||
14832 Subtarget.isTargetWindowsItanium() ||
14833 Subtarget.isTargetWindowsGNU()) {
14834 // Just use the implicit TLS architecture
14835 // Need to generate something similar to:
14836 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14838 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14839 // mov rcx, qword [rdx+rcx*8]
14840 // mov eax, .tls$:tlsvar
14841 // [rax+rcx] contains the address
14842 // Windows 64bit: gs:0x58
14843 // Windows 32bit: fs:__tls_array
    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();
14848 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14849 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14850 // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

14857 SDValue TlsArray = Subtarget.is64Bit()
14858 ? DAG.getIntPtrConstant(0x58, dl)
14859 : (Subtarget.isTargetWindowsGNU()
14860 ? DAG.getIntPtrConstant(0x2C, dl)
14861 : DAG.getExternalSymbol("_tls_array", PtrVT));
14863 SDValue ThreadPointer =
14864 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
14870 // Load the _tls_index variable
14871 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14872 if (Subtarget.is64Bit())
14873 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14874 MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

14886 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14888 // Get the offset of start of .tls section
14889 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14890 GA->getValueType(0),
14891 GA->getOffset(), X86II::MO_SECREL);
14892 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14894 // The address of the thread local variable is the add of the thread
14895 // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

14902 /// Lower SRA_PARTS and friends, which return two i32 values
14903 /// and take a 2 x i32 value to shift plus a shift amount.
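///
/// For example, on a 32-bit target an i64 shift left by a variable amount is
/// split into two i32 halves: the high half is produced with SHLD (shifting
/// bits of the low half in) and the low half with a plain SHL, and a CMOV then
/// patches the case where the shift amount is 32 or more. This note is
/// explanatory only and mirrors the lowering below.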
14904 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14905 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14906 MVT VT = Op.getSimpleValueType();
14907 unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14910 SDValue ShOpLo = Op.getOperand(0);
14911 SDValue ShOpHi = Op.getOperand(1);
14912 SDValue ShAmt = Op.getOperand(2);
14913 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14914 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14916 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14917 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14918 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14919 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14920 : DAG.getConstant(0, dl, VT);
14922 SDValue Tmp2, Tmp3;
14923 if (Op.getOpcode() == ISD::SHL_PARTS) {
14924 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14925 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14927 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14928 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14931 // If the shift amount is larger or equal than the width of a part we can't
14932 // rely on the results of shld/shrd. Insert a test and select the appropriate
14933 // values for large shift amounts.
14934 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14935 DAG.getConstant(VTBits, dl, MVT::i8));
14936 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14937 AndNode, DAG.getConstant(0, dl, MVT::i8));
  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14941 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14942 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14944 if (Op.getOpcode() == ISD::SHL_PARTS) {
14945 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14946 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14948 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

14956 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14957 SelectionDAG &DAG) const {
14958 SDValue Src = Op.getOperand(0);
14959 MVT SrcVT = Src.getSimpleValueType();
14960 MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14964 if (SrcVT.isVector()) {
14965 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14966 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14967 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                         DAG.getUNDEF(SrcVT)));
    }
14970 if (SrcVT.getVectorElementType() == MVT::i1) {
14971 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14972 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14973 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14974 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14975 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

14981 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14982 "Unknown SINT_TO_FP to lower!");
  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

14993 SDValue ValueToStore = Op.getOperand(0);
14994 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14995 !Subtarget.is64Bit())
14996 // Bitcasting to f64 here allows us to do a single 64-bit store from
14997 // an SSE register, avoiding the store forwarding penalty that would come
14998 // with two 32-bit stores.
14999 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15001 unsigned Size = SrcVT.getSizeInBits()/8;
15002 MachineFunction &MF = DAG.getMachineFunction();
15003 auto PtrVT = getPointerTy(MF.getDataLayout());
15004 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15005 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15006 SDValue Chain = DAG.getStore(
15007 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15008 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

15012 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15024 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15026 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15027 MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
15030 MMO = DAG.getMachineFunction().getMachineMemOperand(
15031 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15032 MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
15037 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15038 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
15044 SDValue InFlag = Result.getValue(2);
15046 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15047 // shouldn't be necessary except that RFP cannot be live across
15048 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15049 MachineFunction &MF = DAG.getMachineFunction();
15050 unsigned SSFISize = Op.getValueSizeInBits()/8;
15051 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15052 auto PtrVT = getPointerTy(MF.getDataLayout());
15053 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15054 Tys = DAG.getVTList(MVT::Other);
15056 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15058 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15059 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15060 MachineMemOperand::MOStore, SSFISize, SSFISize);
15062 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15063 Ops, Op.getValueType(), MMO);
15064 Result = DAG.getLoad(
15065 Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}

15072 /// 64-bit unsigned integer to double expansion.
15073 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15074 SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  //
  //    movq       %rax,  %xmm0
  //    punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
  //    subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
  //    #ifdef __SSE3__
  //      haddpd   %xmm0, %xmm0
  //    #else
  //      pshufd   $0x4e, %xmm0, %xmm1
  //      addpd    %xmm1, %xmm0
  //    #endif
  SDLoc dl(Op);
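  // Explanatory note (added annotation, not part of the original comment):
  // punpckldq pairs the low 32 bits of the input with the exponent pattern
  // 0x43300000 (2^52) and the high 32 bits with 0x45300000 (2^84), producing
  // two doubles equal to 2^52 + lo and 2^84 + hi * 2^32. Subtracting the
  // constants 0x1.0p52 and 0x1.0p52 * 0x1.0p32 removes those biases, and the
  // final add recombines hi * 2^32 + lo exactly.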
15089 LLVMContext *Context = DAG.getContext();
15091 // Build some magic constants.
15092 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15093 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15094 auto PtrVT = getPointerTy(DAG.getDataLayout());
15095 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15097 SmallVector<Constant*,2> CV1;
15099 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15100 APInt(64, 0x4330000000000000ULL))));
15102 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15103 APInt(64, 0x4530000000000000ULL))));
15104 Constant *C1 = ConstantVector::get(CV1);
15105 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15107 // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15112 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15113 /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15119 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15120 /* Alignment = */ 16);
15121 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15122 // TODO: Are there any fast-math-flags to propagate here?
15123 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;
  if (Subtarget.hasSSE3()) {
15127 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15128 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15131 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15132 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}

15140 /// 32-bit unsigned integer to float expansion.
15141 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15142 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);
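  // Explanatory note (added annotation, not from the original source): OR-ing
  // the 32-bit value into the low half of the bit pattern 0x4330000000000000
  // yields the double 2^52 + value exactly, since the value lands in the
  // mantissa; subtracting the 2^52 bias below leaves the converted value.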
15148 // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

15152 // Zero out the upper parts of the register.
15153 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15155 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15156 DAG.getBitcast(MVT::v2f64, Load),
15157 DAG.getIntPtrConstant(0, dl));
15159 // Or the load with the bias.
15160 SDValue Or = DAG.getNode(
15161 ISD::OR, dl, MVT::v2i64,
15162 DAG.getBitcast(MVT::v2i64,
15163 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15164 DAG.getBitcast(MVT::v2i64,
15165 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15168 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15170 // Subtract the bias.
15171 // TODO: Are there any fast-math-flags to propagate here?
15172 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15174 // Handle final rounding.
15175 MVT DestVT = Op.getSimpleValueType();
15177 if (DestVT.bitsLT(MVT::f64))
15178 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15179 DAG.getIntPtrConstant(0, dl));
15180 if (DestVT.bitsGT(MVT::f64))
15181 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  // Handle final rounding.
  return Sub;
}

15187 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15188 const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

15192 SDValue N0 = Op.getOperand(0);
15193 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15195 // Legalize to v4i32 type.
15196 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15197 DAG.getUNDEF(MVT::v2i32));
15199 if (Subtarget.hasAVX512())
15200 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15202 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15203 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
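  // Worked example (illustrative annotation only): for the input 0x89ABCDEF
  // the low half LO = 0xCDEF and the high half HI = 0x89AB are each converted
  // with a signed int-to-fp, and the result is recombined as HI * 2^16 + LO,
  // which is exact in double precision for any 32-bit unsigned input.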
15204 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15205 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15207 // Two to the power of half-word-size.
15208 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15210 // Clear upper part of LO, lower HI.
15211 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15212 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15214 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15215 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15216 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15218 // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}

15222 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15223 const X86Subtarget &Subtarget) {
15224 // The algorithm is the following:
15225 // #ifdef __SSE4_1__
15226 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15227 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15228 // (uint4) 0x53000000, 0xaa);
  //     #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  //     #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15234 // return (float4) lo + fhi;
15236 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15237 // reassociate the two FADDs, and if we do that, the algorithm fails
15238 // spectacularly (PR24512).
15239 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15240 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15241 // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
15247 MVT VecIntVT = V.getSimpleValueType();
15248 bool Is128 = VecIntVT == MVT::v4i32;
15249 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something other than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

15255 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15256 "Unsupported custom type");
  // In the #ifdef/#else code, we have in common:
15259 // - The vector of constants:
15265 // Create the splat vector for 0x4b000000.
15266 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15267 // Create the splat vector for 0x53000000.
15268 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15270 // Create the right shift.
15271 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15272 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
15276 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15277 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15278 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15279 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15280 // Low will be bitcasted right away, so do not bother bitcasting back to its
15282 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15283 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15284 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15285 // (uint4) 0x53000000, 0xaa);
15286 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15287 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15288 // High will be bitcasted right away, so do not bother bitcasting back to
15289 // its original type.
15290 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15291 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15294 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15295 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15296 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15298 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

15302 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15303 SDValue VecCstFAdd = DAG.getConstantFP(
15304 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15306 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15307 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15308 // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15311 // return (float4) lo + fhi;
15312 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

15316 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15317 SelectionDAG &DAG) const {
15318 SDValue N0 = Op.getOperand(0);
15319 MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
15323 if (SrcVT == MVT::v2i1)
15324 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15325 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15326 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15327 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15328 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15331 switch (SrcVT.SimpleTy) {
15333 llvm_unreachable("Custom UINT_TO_FP is not supported!");
15338 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15339 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15340 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15343 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15346 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15349 assert(Subtarget.hasAVX512());
15350 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15351 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
15355 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15356 SelectionDAG &DAG) const {
15357 SDValue N0 = Op.getOperand(0);
15359 auto PtrVT = getPointerTy(DAG.getDataLayout());
15361 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
15362 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15363 // the optimization here.
15364 if (DAG.SignBitIsZero(N0))
15365 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15367 if (Op.getSimpleValueType().isVector())
15368 return lowerUINT_TO_FP_vec(Op, DAG);
15370 MVT SrcVT = N0.getSimpleValueType();
15371 MVT DstVT = Op.getSimpleValueType();
15373 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15374 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15375 // Conversions from unsigned i32 to f32/f64 are legal,
15376 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15380 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15381 return LowerUINT_TO_FP_i64(Op, DAG);
15382 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15383 return LowerUINT_TO_FP_i32(Op, DAG);
15384 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15387 // Make a 64-bit buffer, and use it to build an FILD.
15388 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15389 if (SrcVT == MVT::i32) {
15390 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15391 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15392 StackSlot, MachinePointerInfo());
15393 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15394 OffsetSlot, MachinePointerInfo());
15395 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15399 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15400 SDValue ValueToStore = Op.getOperand(0);
15401 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15402 // Bitcasting to f64 here allows us to do a single 64-bit store from
15403 // an SSE register, avoiding the store forwarding penalty that would come
15404 // with two 32-bit stores.
15405 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15406 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15407 MachinePointerInfo());
15408 // For i64 source, we need to add the appropriate power of 2 if the input
15409 // was negative. This is the same as the optimization in
15410 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15411 // we must be careful to do the computation in x87 extended precision, not
15412 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15413 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15414 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15415 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15416 MachineMemOperand::MOLoad, 8, 8);
15418 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15419 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15420 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15423 APInt FF(32, 0x5F800000ULL);
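// 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. FILD treats
// the 64-bit value as signed, so if the unsigned input had its top bit set
// the loaded result is off by exactly 2^64, and that fudge factor is added
// back below.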
15425 // Check whether the sign bit is set.
15426 SDValue SignSet = DAG.getSetCC(
15427 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15428 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15430 // Build a 64-bit pair (0, FF) in the constant pool, with FF in the low bits.
15431 SDValue FudgePtr = DAG.getConstantPool(
15432 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15434 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15435 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15436 SDValue Four = DAG.getIntPtrConstant(4, dl);
15437 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
15439 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15441 // Load the value out, extending it from f32 to f80.
15442 // FIXME: Avoid the extend by constructing the right constant pool?
15443 SDValue Fudge = DAG.getExtLoad(
15444 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15445 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15446 /* Alignment = */ 4);
15447 // Extend everything to 80 bits to force it to be done on x87.
15448 // TODO: Are there any fast-math-flags to propagate here?
15449 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15450 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15451 DAG.getIntPtrConstant(0, dl));
15454 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15455 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15456 // just return an <SDValue(), SDValue()> pair.
15457 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15458 // to i16, i32 or i64, and we lower it to a legal sequence.
15459 // If lowered to the final integer result we return a <result, SDValue()> pair.
15460 // Otherwise we lower it to a sequence ending with a FIST, return a
15461 // <FIST, StackSlot> pair, and the caller is responsible for loading
15462 // the final integer result from StackSlot.
15463 std::pair<SDValue,SDValue>
15464 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15465 bool IsSigned, bool IsReplace) const {
15468 EVT DstTy = Op.getValueType();
15469 EVT TheVT = Op.getOperand(0).getValueType();
15470 auto PtrVT = getPointerTy(DAG.getDataLayout());
15472 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15473 // f16 must be promoted before using the lowering in this routine.
15474 // fp128 does not use this lowering.
15475 return std::make_pair(SDValue(), SDValue());
15478 // If using FIST to compute an unsigned i64, we'll need some fixup
15479 // to handle values above the maximum signed i64. A FIST is always
15480 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15481 bool UnsignedFixup = !IsSigned &&
15482 DstTy == MVT::i64 &&
15483 (!Subtarget.is64Bit() ||
15484 !isScalarFPTypeInSSEReg(TheVT));
15486 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15487 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15488 // The low 32 bits of the fist result will have the correct uint32 result.
15489 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15493 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15494 DstTy.getSimpleVT() >= MVT::i16 &&
15495 "Unknown FP_TO_INT to lower!");
15497 // These are really Legal.
15498 if (DstTy == MVT::i32 &&
15499 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15500 return std::make_pair(SDValue(), SDValue());
15501 if (Subtarget.is64Bit() &&
15502 DstTy == MVT::i64 &&
15503 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15504 return std::make_pair(SDValue(), SDValue());
15506 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
15508 MachineFunction &MF = DAG.getMachineFunction();
15509 unsigned MemSize = DstTy.getSizeInBits()/8;
15510 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15511 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15514 switch (DstTy.getSimpleVT().SimpleTy) {
15515 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15516 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15517 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15518 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15521 SDValue Chain = DAG.getEntryNode();
15522 SDValue Value = Op.getOperand(0);
15523 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15525 if (UnsignedFixup) {
15527 // Conversion to unsigned i64 is implemented with a select,
15528 // depending on whether the source value fits in the range
15529 // of a signed i64. Let Thresh be the FP equivalent of
15530 // 0x8000000000000000ULL.
15532 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15533 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15534 // Fist-to-mem64 FistSrc
15535 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15536 // to XOR'ing the high 32 bits with Adjust.
15538 // Being a power of 2, Thresh is exactly representable in all FP formats.
15539 // For X87 we'd like to use the smallest FP type for this constant, but
15540 // for DAG type consistency we have to match the FP operand type.
15542 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
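// 0x5f000000 is the IEEE-754 single-precision encoding of 2^63, i.e. the
// smallest value that does not fit in a signed i64.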
15543 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15544 bool LosesInfo = false;
15545 if (TheVT == MVT::f64)
15546 // The rounding mode is irrelevant as the conversion should be exact.
15547 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15549 else if (TheVT == MVT::f80)
15550 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15551 APFloat::rmNearestTiesToEven, &LosesInfo);
15553 assert(Status == APFloat::opOK && !LosesInfo &&
15554 "FP conversion should have been exact");
15556 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15558 SDValue Cmp = DAG.getSetCC(DL,
15559 getSetCCResultType(DAG.getDataLayout(),
15560 *DAG.getContext(), TheVT),
15561 Value, ThreshVal, ISD::SETLT);
15562 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15563 DAG.getConstant(0, DL, MVT::i32),
15564 DAG.getConstant(0x80000000, DL, MVT::i32));
15565 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15566 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15567 *DAG.getContext(), TheVT),
15568 Value, ThreshVal, ISD::SETLT);
15569 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15572 // FIXME: This causes a redundant load/store if the SSE-class value is already
15573 // in memory, such as when it is on the call stack.
15574 if (isScalarFPTypeInSSEReg(TheVT)) {
15575 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15576 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15577 MachinePointerInfo::getFixedStack(MF, SSFI));
15578 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15580 Chain, StackSlot, DAG.getValueType(TheVT)
15583 MachineMemOperand *MMO =
15584 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15585 MachineMemOperand::MOLoad, MemSize, MemSize);
15586 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15587 Chain = Value.getValue(1);
15588 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15589 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15592 MachineMemOperand *MMO =
15593 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15594 MachineMemOperand::MOStore, MemSize, MemSize);
15596 if (UnsignedFixup) {
15598 // Insert the FIST, load its result as two i32's,
15599 // and XOR the high i32 with Adjust.
15601 SDValue FistOps[] = { Chain, Value, StackSlot };
15602 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15603 FistOps, DstTy, MMO);
15606 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15607 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15610 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15611 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15613 if (Subtarget.is64Bit()) {
15614 // Join High32 and Low32 into a 64-bit result.
15615 // (High32 << 32) | Low32
15616 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15617 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15618 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15619 DAG.getConstant(32, DL, MVT::i8));
15620 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15621 return std::make_pair(Result, SDValue());
15624 SDValue ResultOps[] = { Low32, High32 };
15626 SDValue pair = IsReplace
15627 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15628 : DAG.getMergeValues(ResultOps, DL);
15629 return std::make_pair(pair, SDValue());
15631 // Build the FP_TO_INT*_IN_MEM
15632 SDValue Ops[] = { Chain, Value, StackSlot };
15633 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15635 return std::make_pair(FIST, StackSlot);
15639 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15640 const X86Subtarget &Subtarget) {
15641 MVT VT = Op->getSimpleValueType(0);
15642 SDValue In = Op->getOperand(0);
15643 MVT InVT = In.getSimpleValueType();
15646 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15647 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15649 // Optimize vectors in AVX mode:
15652 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15653 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15654 // Concat upper and lower parts.
15657 // Use vpunpckldq for 2 lower elements v4i32 -> v2i64.
15658 // Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
15659 // Concat upper and lower parts.
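//
// Interleaving the source with a zero vector places zeros in the high half
// of every widened lane (x86 is little-endian), which is exactly a
// zero-extension; for ANY_EXTEND an undef vector is used instead since the
// high bits are don't-care.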
15662 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15663 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15664 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15667 if (Subtarget.hasInt256())
15668 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15670 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15671 SDValue Undef = DAG.getUNDEF(InVT);
15672 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15673 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15674 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15676 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15677 VT.getVectorNumElements()/2);
15679 OpLo = DAG.getBitcast(HVT, OpLo);
15680 OpHi = DAG.getBitcast(HVT, OpHi);
15682 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15685 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15686 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15687 MVT VT = Op->getSimpleValueType(0);
15688 SDValue In = Op->getOperand(0);
15689 MVT InVT = In.getSimpleValueType();
15691 unsigned NumElts = VT.getVectorNumElements();
15693 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15694 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15695 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15697 if (InVT.getVectorElementType() != MVT::i1)
15700 // Widen to a 512-bit vector type if the result is a 256- or 128-bit vector and VLX is not supported.
15702 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15703 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15706 SDValue One = DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15708 SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15710 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15712 if (VT == ExtVT) return SelectedVal;
15713 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15716 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15717 SelectionDAG &DAG) {
15718 if (Subtarget.hasFp256())
15719 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15725 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15726 SelectionDAG &DAG) {
15728 MVT VT = Op.getSimpleValueType();
15729 SDValue In = Op.getOperand(0);
15730 MVT SVT = In.getSimpleValueType();
15732 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15733 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15735 if (Subtarget.hasFp256())
15736 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15739 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15740 VT.getVectorNumElements() != SVT.getVectorNumElements());
15744 /// Helper to recursively truncate vector elements in half with PACKSS.
15745 /// It makes use of the fact that vector comparison results will be all-zeros
15746 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15747 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15748 /// within each 128-bit lane.
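/// PACKSS is safe here because signed saturation maps 0 to 0 and -1 to -1,
/// so truncating an all-zeros/all-ones comparison mask this way is lossless
/// even though a general signed value would saturate.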
15749 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15752 const X86Subtarget &Subtarget) {
15753 // Requires SSE2 but AVX512 has fast truncate.
15754 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15757 EVT SrcVT = In.getValueType();
15759 // No truncation required, we might get here due to recursive calls.
15760 if (SrcVT == DstVT)
15763 // We only support vector truncation to 128bits or greater from a
15764 // 256bits or greater source.
15765 if ((DstVT.getSizeInBits() % 128) != 0)
15767 if ((SrcVT.getSizeInBits() % 256) != 0)
15770 unsigned NumElems = SrcVT.getVectorNumElements();
15771 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15772 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15775 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15777 // Extract lower/upper subvectors.
15778 unsigned NumSubElts = NumElems / 2;
15779 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15780 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15781 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15783 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15784 if (SrcVT.is256BitVector()) {
15785 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15786 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15787 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15788 return DAG.getBitcast(DstVT, Res);
15791 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15792 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15793 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15794 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15795 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15796 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15798 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15799 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15800 Res = DAG.getBitcast(MVT::v4i64, Res);
15801 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15803 if (DstVT.is256BitVector())
15804 return DAG.getBitcast(DstVT, Res);
15806 // For a 512-bit -> 128-bit truncate, recurse to pack another stage.
15807 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15808 Res = DAG.getBitcast(PackedVT, Res);
15809 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15812 // Recursively pack lower/upper subvectors, concat result and pack again.
15813 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15814 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15815 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15816 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15818 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15819 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15820 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15823 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15824 const X86Subtarget &Subtarget) {
15827 MVT VT = Op.getSimpleValueType();
15828 SDValue In = Op.getOperand(0);
15829 MVT InVT = In.getSimpleValueType();
15831 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15833 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
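// Only bit 0 of each element matters for a truncate to i1; shifting it into
// the sign bit both discards the other bits and puts it where VPMOVB2M /
// VPMOVW2M read the mask, while TESTM(x, x) afterwards simply checks the
// element for being nonzero.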
15834 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15835 if (InVT.getScalarSizeInBits() <= 16) {
15836 if (Subtarget.hasBWI()) {
15837 // Legal; this will be selected to VPMOVB2M / VPMOVW2M.
15838 // Shifting packed bytes is not supported natively, so bitcast to words.
15839 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15840 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15841 DAG.getBitcast(ExtVT, In),
15842 DAG.getConstant(ShiftInx, DL, ExtVT));
15843 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15844 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15846 // Use TESTD/Q, extended vector to packed dword/qword.
15847 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15848 "Unexpected vector type.");
15849 unsigned NumElts = InVT.getVectorNumElements();
15850 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15851 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15853 ShiftInx = InVT.getScalarSizeInBits() - 1;
15856 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15857 DAG.getConstant(ShiftInx, DL, InVT));
15858 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15861 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15863 MVT VT = Op.getSimpleValueType();
15864 SDValue In = Op.getOperand(0);
15865 MVT InVT = In.getSimpleValueType();
15867 if (VT == MVT::i1) {
15868 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15869 "Invalid scalar TRUNCATE operation");
15870 if (InVT.getSizeInBits() >= 32)
15872 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15873 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15875 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15876 "Invalid TRUNCATE operation");
15878 if (VT.getVectorElementType() == MVT::i1)
15879 return LowerTruncateVecI1(Op, DAG, Subtarget);
15881 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15882 if (Subtarget.hasAVX512()) {
15883 // Word-to-byte truncation is only available with BWI.
15884 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15885 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15886 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15887 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15890 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15891 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15892 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15895 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15896 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15897 if (Subtarget.hasInt256()) {
15898 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15899 In = DAG.getBitcast(MVT::v8i32, In);
15900 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15901 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15902 DAG.getIntPtrConstant(0, DL));
15905 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15906 DAG.getIntPtrConstant(0, DL));
15907 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15908 DAG.getIntPtrConstant(2, DL));
15909 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15910 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15911 static const int ShufMask[] = {0, 2, 4, 6};
15912 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15915 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15916 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15917 if (Subtarget.hasInt256()) {
15918 In = DAG.getBitcast(MVT::v32i8, In);
15920 // The PSHUFB mask:
15921 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15922 -1, -1, -1, -1, -1, -1, -1, -1,
15923 16, 17, 20, 21, 24, 25, 28, 29,
15924 -1, -1, -1, -1, -1, -1, -1, -1 };
15925 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15926 In = DAG.getBitcast(MVT::v4i64, In);
15928 static const int ShufMask2[] = {0, 2, -1, -1};
15929 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15930 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15931 DAG.getIntPtrConstant(0, DL));
15932 return DAG.getBitcast(VT, In);
15935 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15936 DAG.getIntPtrConstant(0, DL));
15938 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15939 DAG.getIntPtrConstant(4, DL));
15941 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15942 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15944 // The PSHUFB mask:
15945 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15946 -1, -1, -1, -1, -1, -1, -1, -1};
15948 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15949 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15951 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15952 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15954 // The MOVLHPS Mask:
15955 static const int ShufMask2[] = {0, 1, 4, 5};
15956 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15957 return DAG.getBitcast(MVT::v8i16, res);
15960 // Handle truncation of V256 to V128 using shuffles.
15961 if (!VT.is128BitVector() || !InVT.is256BitVector())
15964 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15966 unsigned NumElems = VT.getVectorNumElements();
15967 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15969 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15970 // Prepare truncation shuffle mask
15971 for (unsigned i = 0; i != NumElems; ++i)
15972 MaskVec[i] = i * 2;
15973 In = DAG.getBitcast(NVT, In);
15974 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15975 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15976 DAG.getIntPtrConstant(0, DL));
15979 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15980 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15981 MVT VT = Op.getSimpleValueType();
15983 if (VT.isVector()) {
15984 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15985 SDValue Src = Op.getOperand(0);
15987 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15988 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15989 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15990 DAG.getUNDEF(MVT::v2f32)));
15996 assert(!VT.isVector());
15998 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15999 IsSigned, /*IsReplace=*/ false);
16000 SDValue FIST = Vals.first, StackSlot = Vals.second;
16001 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16002 if (!FIST.getNode())
16005 if (StackSlot.getNode())
16006 // Load the result.
16007 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16009 // The node is the result.
16013 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16015 MVT VT = Op.getSimpleValueType();
16016 SDValue In = Op.getOperand(0);
16017 MVT SVT = In.getSimpleValueType();
16019 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16021 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16022 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16023 In, DAG.getUNDEF(SVT)));
16026 /// The only differences between FABS and FNEG are the mask and the logic op.
16027 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16028 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16029 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16030 "Wrong opcode for lowering FABS or FNEG.");
16032 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16034 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16035 // into an FNABS. We'll lower the FABS after that if it is still in use.
16037 for (SDNode *User : Op->uses())
16038 if (User->getOpcode() == ISD::FNEG)
16042 MVT VT = Op.getSimpleValueType();
16044 bool IsF128 = (VT == MVT::f128);
16046 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16047 // decide if we should generate a 16-byte constant mask when we only need 4 or
16048 // 8 bytes for the scalar case.
16053 if (VT.isVector()) {
16055 EltVT = VT.getVectorElementType();
16056 } else if (IsF128) {
16057 // SSE instructions are used for optimized f128 logical operations.
16058 LogicVT = MVT::f128;
16061 // There are no scalar bitwise logical SSE/AVX instructions, so we
16062 // generate a 16-byte vector constant and logic op even for the scalar case.
16063 // Using a 16-byte mask allows folding the load of the mask with
16064 // the logic op, so it can save (~4 bytes) on code size.
16065 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16069 unsigned EltBits = EltVT.getSizeInBits();
16070 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16072 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16073 const fltSemantics &Sem =
16074 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16075 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16076 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16078 SDValue Op0 = Op.getOperand(0);
16079 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16081 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16082 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16084 if (VT.isVector() || IsF128)
16085 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16087 // For the scalar case extend to a 128-bit vector, perform the logic op,
16088 // and extract the scalar result back out.
16089 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16090 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16091 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16092 DAG.getIntPtrConstant(0, dl));
16095 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16096 SDValue Mag = Op.getOperand(0);
16097 SDValue Sign = Op.getOperand(1);
16100 // If the sign operand is smaller, extend it first.
16101 MVT VT = Op.getSimpleValueType();
16102 if (Sign.getSimpleValueType().bitsLT(VT))
16103 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16105 // And if it is bigger, shrink it first.
16106 if (Sign.getSimpleValueType().bitsGT(VT))
16107 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16109 // At this point the operands and the result should have the same
16110 // type, and that won't be f80 since that is not custom lowered.
16111 bool IsF128 = (VT == MVT::f128);
16112 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16113 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16114 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16115 "Unexpected type in LowerFCOPYSIGN");
16117 MVT EltVT = VT.getScalarType();
16118 const fltSemantics &Sem =
16119 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16120 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16122 // Perform all scalar logic operations as 16-byte vectors because there are no
16123 // scalar FP logic instructions in SSE.
16124 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16125 // unnecessary splats, but we might miss load folding opportunities. Should
16126 // this decision be based on OptimizeForSize?
16127 bool IsFakeVector = !VT.isVector() && !IsF128;
16130 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16132 // The mask constants are automatically splatted for vector types.
16133 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16134 SDValue SignMask = DAG.getConstantFP(
16135 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16136 SDValue MagMask = DAG.getConstantFP(
16137 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
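// copysign(mag, sign) is the pure bitwise identity
//   (mag & ~SignMask) | (sign & SignMask)
// and the FAND/FOR sequence below implements exactly that.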
16139 // First, clear all bits but the sign bit from the second operand (sign).
16141 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16142 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16144 // Next, clear the sign bit from the first operand (magnitude).
16145 // TODO: If we had general constant folding for FP logic ops, this check
16146 // wouldn't be necessary.
16148 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16149 APFloat APF = Op0CN->getValueAPF();
16151 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16153 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16155 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16156 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16159 // OR the magnitude value with the sign bit.
16160 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16161 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16162 DAG.getIntPtrConstant(0, dl));
16165 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16166 SDValue N0 = Op.getOperand(0);
16168 MVT VT = Op.getSimpleValueType();
16170 MVT OpVT = N0.getSimpleValueType();
16171 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16172 "Unexpected type for FGETSIGN");
16174 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
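// MOVMSK gathers the sign bit of every vector lane into the low bits of a
// GPR; the scalar input sits in lane 0, so masking with 1 leaves just its
// sign bit.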
16175 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16176 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16177 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16178 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16179 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16183 // Check whether an OR'd tree is PTEST-able.
16184 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16185 SelectionDAG &DAG) {
16186 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16188 if (!Subtarget.hasSSE41())
16191 if (!Op->hasOneUse())
16194 SDNode *N = Op.getNode();
16197 SmallVector<SDValue, 8> Opnds;
16198 DenseMap<SDValue, unsigned> VecInMap;
16199 SmallVector<SDValue, 8> VecIns;
16200 EVT VT = MVT::Other;
16202 // Recognize a special case where a vector is cast into a wide integer to test all zeros.
16204 Opnds.push_back(N->getOperand(0));
16205 Opnds.push_back(N->getOperand(1));
16207 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16208 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16209 // BFS traverse all OR'd operands.
16210 if (I->getOpcode() == ISD::OR) {
16211 Opnds.push_back(I->getOperand(0));
16212 Opnds.push_back(I->getOperand(1));
16213 // Re-evaluate the number of nodes to be traversed.
16214 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16218 // Quit if this is not an EXTRACT_VECTOR_ELT.
16219 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16222 // Quit if the index is not a constant.
16223 SDValue Idx = I->getOperand(1);
16224 if (!isa<ConstantSDNode>(Idx))
16227 SDValue ExtractedFromVec = I->getOperand(0);
16228 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16229 if (M == VecInMap.end()) {
16230 VT = ExtractedFromVec.getValueType();
16231 // Quit if not 128/256-bit vector.
16232 if (!VT.is128BitVector() && !VT.is256BitVector())
16234 // Quit if not the same type.
16235 if (VecInMap.begin() != VecInMap.end() &&
16236 VT != VecInMap.begin()->first.getValueType())
16238 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16239 VecIns.push_back(ExtractedFromVec);
16241 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16244 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16245 "Not extracted from 128-/256-bit vector.");
16247 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
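// Each map entry records (as a bitmask) which lanes of a source vector were
// extracted and OR'd together. The scalar OR tree is only equivalent to a
// whole-vector test if every lane of every source vector is covered, i.e.
// each entry equals FullMask.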
16249 for (DenseMap<SDValue, unsigned>::const_iterator
16250 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16251 // Quit if not all elements are used.
16252 if (I->second != FullMask)
16256 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16258 // Cast all vectors into TestVT for PTEST.
16259 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16260 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16262 // If more than one full vector is evaluated, OR them first before PTEST.
16263 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16264 // Each iteration will OR 2 nodes and append the result until there is only
16265 // 1 node left, i.e. the final OR'd value of all vectors.
16266 SDValue LHS = VecIns[Slot];
16267 SDValue RHS = VecIns[Slot + 1];
16268 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16271 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16274 /// \brief Return true if \c Op has a use that doesn't just read flags.
16275 static bool hasNonFlagsUse(SDValue Op) {
16276 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16278 SDNode *User = *UI;
16279 unsigned UOpNo = UI.getOperandNo();
16280 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16281 // Look past the truncate.
16282 UOpNo = User->use_begin().getOperandNo();
16283 User = *User->use_begin();
16286 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16287 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16293 // Emit KTEST instruction for bit vectors on AVX-512
16294 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16295 const X86Subtarget &Subtarget) {
16296 if (Op.getOpcode() == ISD::BITCAST) {
16297 auto hasKTEST = [&](MVT VT) {
16298 unsigned SizeInBits = VT.getSizeInBits();
16299 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16300 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16302 SDValue Op0 = Op.getOperand(0);
16303 MVT Op0VT = Op0.getValueType().getSimpleVT();
16304 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16306 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16311 /// Emit nodes that will be selected as "test Op0,Op0", or something
16313 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16314 SelectionDAG &DAG) const {
16315 if (Op.getValueType() == MVT::i1) {
16316 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16317 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16318 DAG.getConstant(0, dl, MVT::i8));
16320 // CF and OF aren't always set the way we want. Determine which
16321 // of these we need.
16322 bool NeedCF = false;
16323 bool NeedOF = false;
16326 case X86::COND_A: case X86::COND_AE:
16327 case X86::COND_B: case X86::COND_BE:
16330 case X86::COND_G: case X86::COND_GE:
16331 case X86::COND_L: case X86::COND_LE:
16332 case X86::COND_O: case X86::COND_NO: {
16333 // Check if we really need to set the
16334 // Overflow flag. If NoSignedWrap is present,
16335 // it is not actually needed.
16336 switch (Op->getOpcode()) {
16341 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
16342 if (BinNode->Flags.hasNoSignedWrap())
16352 // See if we can use the EFLAGS value from the operand instead of
16353 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16354 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16355 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16356 // Emit KTEST for bit vectors
16357 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16359 // Emit a CMP with 0, which is the TEST pattern.
16360 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16361 DAG.getConstant(0, dl, Op.getValueType()));
16363 unsigned Opcode = 0;
16364 unsigned NumOperands = 0;
16366 // Truncate operations may prevent the merge of the SETCC instruction
16367 // and the arithmetic instruction before it. Attempt to truncate the operands
16368 // of the arithmetic instruction and use a reduced bit-width instruction.
16369 bool NeedTruncation = false;
16370 SDValue ArithOp = Op;
16371 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16372 SDValue Arith = Op->getOperand(0);
16373 // Both the trunc and the arithmetic op need to have one user each.
16374 if (Arith->hasOneUse())
16375 switch (Arith.getOpcode()) {
16382 NeedTruncation = true;
16388 // Sometimes flags can be set either with an AND or with an SRL/SHL
16389 // instruction. The SRL/SHL variant should be preferred for masks longer than this number of bits.
16391 const int ShiftToAndMaxMaskWidth = 32;
16392 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16394 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16395 // which may be the result of a CAST. We use the variable 'Op', which is the
16396 // non-casted variable when we check for possible users.
16397 switch (ArithOp.getOpcode()) {
16399 // Due to an isel shortcoming, be conservative if this add is likely to be
16400 // selected as part of a load-modify-store instruction. When the root node
16401 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16402 // uses of other nodes in the match, such as the ADD in this case. This
16403 // leads to the ADD being left around and reselected, with the result being
16404 // two adds in the output. Alas, even if none of our users are stores, that
16405 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16406 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16407 // climbing the DAG back to the root, and it doesn't seem to be worth the effort.
16409 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16410 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16411 if (UI->getOpcode() != ISD::CopyToReg &&
16412 UI->getOpcode() != ISD::SETCC &&
16413 UI->getOpcode() != ISD::STORE)
16416 if (ConstantSDNode *C =
16417 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16418 // An add of one will be selected as an INC.
16419 if (C->isOne() && !Subtarget.slowIncDec()) {
16420 Opcode = X86ISD::INC;
16425 // An add of negative one (subtract of one) will be selected as a DEC.
16426 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16427 Opcode = X86ISD::DEC;
16433 // Otherwise use a regular EFLAGS-setting add.
16434 Opcode = X86ISD::ADD;
16439 // If we have a constant logical shift that's only used in a comparison
16440 // against zero turn it into an equivalent AND. This allows turning it into
16441 // a TEST instruction later.
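// For example, (srl X, 8) compared against zero becomes
// (and X, 0xFFFFFF00) == 0 for an i32 X, which can later be selected as a
// TEST against an immediate mask.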
16442 if (ZeroCheck && Op->hasOneUse() &&
16443 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16444 EVT VT = Op.getValueType();
16445 unsigned BitWidth = VT.getSizeInBits();
16446 unsigned ShAmt = Op->getConstantOperandVal(1);
16447 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16449 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16450 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16451 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16452 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16454 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16455 DAG.getConstant(Mask, dl, VT));
16460 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16461 // because a TEST instruction will be better. However, AND should be
16462 // preferred if the instruction can be combined into ANDN.
16463 if (!hasNonFlagsUse(Op)) {
16464 SDValue Op0 = ArithOp->getOperand(0);
16465 SDValue Op1 = ArithOp->getOperand(1);
16466 EVT VT = ArithOp.getValueType();
16467 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16468 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16469 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16471 // If we cannot select an ANDN instruction, check if we can replace
16472 // AND+IMM64 with a shift before giving up. This is possible for masks
16473 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
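// E.g. for an i64 X, (X & 0xFFFFFFFF00000000) == 0 can be checked as
// (X >> 32) == 0, and (X & 0xFFFFFFFF) == 0 as (X << 32) == 0; the rewrite
// is valid because only the zero flag is consumed here.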
16474 if (!isProperAndn) {
16478 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16479 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16483 const APInt &Mask = CN->getAPIntValue();
16484 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16485 break; // Prefer TEST instruction.
16487 unsigned BitWidth = Mask.getBitWidth();
16488 unsigned LeadingOnes = Mask.countLeadingOnes();
16489 unsigned TrailingZeros = Mask.countTrailingZeros();
16491 if (LeadingOnes + TrailingZeros == BitWidth) {
16492 assert(TrailingZeros < VT.getSizeInBits() &&
16493 "Shift amount should be less than the type width");
16494 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16495 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16496 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16500 unsigned LeadingZeros = Mask.countLeadingZeros();
16501 unsigned TrailingOnes = Mask.countTrailingOnes();
16503 if (LeadingZeros + TrailingOnes == BitWidth) {
16504 assert(LeadingZeros < VT.getSizeInBits() &&
16505 "Shift amount should be less than the type width");
16506 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16507 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16508 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16519 // Due to the ISEL shortcoming noted above, be conservative if this op is
16520 // likely to be selected as part of a load-modify-store instruction.
16521 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16522 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16523 if (UI->getOpcode() == ISD::STORE)
16526 // Otherwise use a regular EFLAGS-setting instruction.
16527 switch (ArithOp.getOpcode()) {
16528 default: llvm_unreachable("unexpected operator!");
16529 case ISD::SUB: Opcode = X86ISD::SUB; break;
16530 case ISD::XOR: Opcode = X86ISD::XOR; break;
16531 case ISD::AND: Opcode = X86ISD::AND; break;
16533 if (!NeedTruncation && ZeroCheck) {
16534 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16537 Opcode = X86ISD::OR;
16551 return SDValue(Op.getNode(), 1);
16557 // If we found that truncation is beneficial, perform the truncation and update 'Op'.
16559 if (NeedTruncation) {
16560 EVT VT = Op.getValueType();
16561 SDValue WideVal = Op->getOperand(0);
16562 EVT WideVT = WideVal.getValueType();
16563 unsigned ConvertedOp = 0;
16564 // Use a target machine opcode to prevent further DAGCombine
16565 // optimizations that may separate the arithmetic operations
16566 // from the setcc node.
16567 switch (WideVal.getOpcode()) {
16569 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16570 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16571 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16572 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16573 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16578 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16579 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16580 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16581 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16587 // Emit KTEST for bit vectors
16588 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16591 // Emit a CMP with 0, which is the TEST pattern.
16592 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16593 DAG.getConstant(0, dl, Op.getValueType()));
16595 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16596 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16598 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16599 DAG.ReplaceAllUsesWith(Op, New);
16600 return SDValue(New.getNode(), 1);
16603 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16605 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16606 const SDLoc &dl, SelectionDAG &DAG) const {
16607 if (isNullConstant(Op1))
16608 return EmitTest(Op0, X86CC, dl, DAG);
16610 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16611 "Unexpected comparison operation for MVT::i1 operands");
16613 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16614 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16615 // Only promote the compare up to i32 if it is a 16-bit operation
16616 // with an immediate. 16-bit immediates are to be avoided.
16617 if ((Op0.getValueType() == MVT::i16 &&
16618 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16619 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16620 !Subtarget.isAtom()) {
16621 unsigned ExtendOp =
16622 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16623 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16624 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16626 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16627 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16628 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16630 return SDValue(Sub.getNode(), 1);
16632 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16635 /// Convert a comparison if required by the subtarget.
16636 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16637 SelectionDAG &DAG) const {
16638 // If the subtarget does not support the FUCOMI instruction, floating-point
16639 // comparisons have to be converted.
16640 if (Subtarget.hasCMov() ||
16641 Cmp.getOpcode() != X86ISD::CMP ||
16642 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16643 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16646 // The instruction selector will select an FUCOM instruction instead of
16647 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16648 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16649 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
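// FNSTSW copies the x87 status word (C0 at bit 8, C2 at bit 10, C3 at bit 14)
// into a GPR; shifting right by 8 lines those condition bits up with CF, PF
// and ZF, which SAHF then loads into EFLAGS, the same flags FUCOMI would
// have produced directly.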
16651 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16652 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16653 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16654 DAG.getConstant(8, dl, MVT::i8));
16655 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16657 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16658 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16659 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16662 /// Check if replacement of SQRT with RSQRT should be disabled.
16663 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16664 EVT VT = Op.getValueType();
16666 // We never want to use both SQRT and RSQRT instructions for the same input.
16667 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16671 return Subtarget.hasFastVectorFSQRT();
16672 return Subtarget.hasFastScalarFSQRT();
16675 /// The minimum architected relative accuracy is 2^-12. We need one
16676 /// Newton-Raphson step to have a good float result (24 bits of precision).
16677 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16678 SelectionDAG &DAG, int Enabled,
16679 int &RefinementSteps,
16680 bool &UseOneConstNR,
16681 bool Reciprocal) const {
16682 EVT VT = Op.getValueType();
16684 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16685 // TODO: Add support for AVX512 (v16f32).
16686 // It is likely not profitable to do this for f64 because a double-precision
16687 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16688 // instructions: convert to single, rsqrtss, convert back to double, refine
16689 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16690 // along with FMA, this could be a throughput win.
16691 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16692 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16693 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16694 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16695 RefinementSteps = 1;
16697 UseOneConstNR = false;
16698 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16703 /// The minimum architected relative accuracy is 2^-12. We need one
16704 /// Newton-Raphson step to have a good float result (24 bits of precision).
16705 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16707 int &RefinementSteps) const {
16708 EVT VT = Op.getValueType();
16710 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16711 // TODO: Add support for AVX512 (v16f32).
16712 // It is likely not profitable to do this for f64 because a double-precision
16713 // reciprocal estimate with refinement on x86 prior to FMA requires
16714 // 15 instructions: convert to single, rcpss, convert back to double, refine
16715 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16716 // along with FMA, this could be a throughput win.
16718 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16719 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16720 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16721 // Enable estimate codegen with 1 refinement step for vector division.
16722 // Scalar division estimates are disabled because they break too much
16723 // real-world code. These defaults are intended to match GCC behavior.
16724 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16727 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16728 RefinementSteps = 1;
16730 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16735 /// If we have at least two divisions that use the same divisor, convert to
16736 /// multiplication by a reciprocal. This may need to be adjusted for a given
16737 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16738 /// This is because we still need one division to calculate the reciprocal and
16739 /// then we need two multiplies by that reciprocal as replacements for the
16740 /// original divisions.
16741 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16745 /// Helper for creating a X86ISD::SETCC node.
16746 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16747 SelectionDAG &DAG) {
16748 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16749 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16752 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16753 /// according to equal/not-equal condition code \p CC.
16754 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16755 const SDLoc &dl, SelectionDAG &DAG) {
16756 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16757 // instruction. Since the shift amount is in-range-or-undefined, we know
16758 // that doing a bittest on the i32 value is ok. We extend to i32 because
16759 // the encoding for the i16 version is larger than the i32 version.
16760 // Also promote i16 to i32 for performance / code size reason.
16761 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16762 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16764 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16765 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16766 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16767 // known to be zero.
16768 if (Src.getValueType() == MVT::i64 &&
16769 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16770 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16772 // If the operand types disagree, extend the shift amount to match. Since
16773 // BT ignores high bits (like shifts) we can use anyextend.
16774 if (Src.getValueType() != BitNo.getValueType())
16775 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16777 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
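// BT leaves the tested bit in CF, so equality with zero maps to COND_AE
// (CF == 0) and inequality to COND_B (CF == 1).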
16778 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16779 return getSETCC(Cond, BT, dl , DAG);
16782 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16783 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16784 const SDLoc &dl, SelectionDAG &DAG) {
16785 SDValue Op0 = And.getOperand(0);
16786 SDValue Op1 = And.getOperand(1);
16787 if (Op0.getOpcode() == ISD::TRUNCATE)
16788 Op0 = Op0.getOperand(0);
16789 if (Op1.getOpcode() == ISD::TRUNCATE)
16790 Op1 = Op1.getOperand(0);
16793 if (Op1.getOpcode() == ISD::SHL)
16794 std::swap(Op0, Op1);
16795 if (Op0.getOpcode() == ISD::SHL) {
16796 if (isOneConstant(Op0.getOperand(0))) {
16797 // If we looked past a truncate, check that it's only truncating away known zeros.
16799 unsigned BitWidth = Op0.getValueSizeInBits();
16800 unsigned AndBitWidth = And.getValueSizeInBits();
16801 if (BitWidth > AndBitWidth) {
16803 DAG.computeKnownBits(Op0, Zeros, Ones);
16804 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16808 RHS = Op0.getOperand(1);
16810 } else if (Op1.getOpcode() == ISD::Constant) {
16811 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16812 uint64_t AndRHSVal = AndRHS->getZExtValue();
16813 SDValue AndLHS = Op0;
16815 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16816 LHS = AndLHS.getOperand(0);
16817 RHS = AndLHS.getOperand(1);
16820 // Use BT if the immediate can't be encoded in a TEST instruction.
16821 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16823 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16828 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16833 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16834 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16835 const SDLoc &dl, SelectionDAG &DAG) {
16837 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16838 "Expected TRUNCATE to i1 node");
16840 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16841 return SDValue();
16843 SDValue ShiftRight = Op.getOperand(0);
16844 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16845 CC, dl, DAG);
16848 /// Result of 'and' or 'trunc to i1' is compared against zero.
16849 /// Change to a BT node if possible.
16850 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16851 const SDLoc &dl, SelectionDAG &DAG) const {
16852 if (Op.getOpcode() == ISD::AND)
16853 return LowerAndToBT(Op, CC, dl, DAG);
16854 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16855 return LowerTruncateToBT(Op, CC, dl, DAG);
16859 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16861 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16866 // SSE Condition code mapping:
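//  0 - EQ,  1 - LT,  2 - LE,  3 - UNORD,
//  4 - NEQ, 5 - NLT, 6 - NLE, 7 - ORD
// (These are the CMPPS/CMPPD immediate predicates; the value 8 used below is
// a local sentinel for SETUEQ/SETONE, which need two compares, not a real
// predicate.)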
16875 switch (SetCCOpcode) {
16876 default: llvm_unreachable("Unexpected SETCC condition");
16878 case ISD::SETEQ: SSECC = 0; break;
16880 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16882 case ISD::SETOLT: SSECC = 1; break;
16884 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16886 case ISD::SETOLE: SSECC = 2; break;
16887 case ISD::SETUO: SSECC = 3; break;
16889 case ISD::SETNE: SSECC = 4; break;
16890 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16891 case ISD::SETUGE: SSECC = 5; break;
16892 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16893 case ISD::SETUGT: SSECC = 6; break;
16894 case ISD::SETO: SSECC = 7; break;
16895 case ISD::SETUEQ:
16896 case ISD::SETONE: SSECC = 8; break;
16897 }
16898 if (Swap)
16899 std::swap(Op0, Op1);
16904 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16905 /// concatenate the result back.
16906 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16907 MVT VT = Op.getSimpleValueType();
16909 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16910 "Unsupported value type for operation");
16912 unsigned NumElems = VT.getVectorNumElements();
16914 SDValue CC = Op.getOperand(2);
16916 // Extract the LHS vectors
16917 SDValue LHS = Op.getOperand(0);
16918 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16919 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16921 // Extract the RHS vectors
16922 SDValue RHS = Op.getOperand(1);
16923 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16924 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16926 // Issue the operation on the smaller types and concatenate the result back
16927 MVT EltVT = VT.getVectorElementType();
16928 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16929 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16930 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16931 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16934 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16935 SDValue Op0 = Op.getOperand(0);
16936 SDValue Op1 = Op.getOperand(1);
16937 SDValue CC = Op.getOperand(2);
16938 MVT VT = Op.getSimpleValueType();
16941 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16942 "Unexpected type for boolean compare operation");
16943 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16944 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16945 DAG.getConstant(-1, dl, VT));
16946 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16947 DAG.getConstant(-1, dl, VT));
16948 switch (SetCCOpcode) {
16949 default: llvm_unreachable("Unexpected SETCC condition");
16950 case ISD::SETEQ:
16951 // (x == y) -> ~(x ^ y)
16952 return DAG.getNode(ISD::XOR, dl, VT,
16953 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16954 DAG.getConstant(-1, dl, VT));
16955 case ISD::SETNE:
16956 // (x != y) -> (x ^ y)
16957 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16958 case ISD::SETUGT:
16959 case ISD::SETGT:
16960 // (x > y) -> (x & ~y)
16961 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16962 case ISD::SETULT:
16963 case ISD::SETLT:
16964 // (x < y) -> (~x & y)
16965 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16966 case ISD::SETULE:
16967 case ISD::SETLE:
16968 // (x <= y) -> (~x | y)
16969 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16970 case ISD::SETUGE:
16971 case ISD::SETGE:
16972 // (x >= y) -> (x | ~y)
16973 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
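// Sanity check for the i1 identities above, e.g. x <= y: for (x,y) = (1,0)
// the result must be 0 and (~x | y) = (0 | 0) = 0; for every other pair the
// result and (~x | y) are both 1.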
16977 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16979 SDValue Op0 = Op.getOperand(0);
16980 SDValue Op1 = Op.getOperand(1);
16981 SDValue CC = Op.getOperand(2);
16982 MVT VT = Op.getSimpleValueType();
16985 assert(VT.getVectorElementType() == MVT::i1 &&
16986 "Cannot set masked compare for this operation");
16988 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16990 bool Unsigned = false;
16993 switch (SetCCOpcode) {
16994 default: llvm_unreachable("Unexpected SETCC condition");
16995 case ISD::SETNE: SSECC = 4; break;
16996 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16997 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16998 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16999 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17000 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17001 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17002 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17003 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17004 case ISD::SETLE: SSECC = 2; break;
17007 if (Swap)
17008 std::swap(Op0, Op1);
17009 if (Opc)
17010 return DAG.getNode(Opc, dl, VT, Op0, Op1);
17011 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17012 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17013 DAG.getConstant(SSECC, dl, MVT::i8));
17016 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17017 /// operand \p Op1. If non-trivial (for example because it's not constant)
17018 /// return an empty value.
17019 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17020 SelectionDAG &DAG) {
17021 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17025 MVT VT = Op1.getSimpleValueType();
17026 MVT EVT = VT.getVectorElementType();
17027 unsigned n = VT.getVectorNumElements();
17028 SmallVector<SDValue, 8> ULTOp1;
17030 for (unsigned i = 0; i < n; ++i) {
17031 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17032 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17035 // Avoid underflow.
17036 APInt Val = Elt->getAPIntValue();
17037 if (Val == 0)
17038 return SDValue();
17040 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17043 return DAG.getBuildVector(VT, dl, ULTOp1);
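// E.g. (x <u 5) becomes (x <=u 4); elements equal to 0 cannot be handled this
// way (see the underflow check above), since Val - 1 would wrap to all-ones.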
17046 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17047 SelectionDAG &DAG) {
17048 SDValue Op0 = Op.getOperand(0);
17049 SDValue Op1 = Op.getOperand(1);
17050 SDValue CC = Op.getOperand(2);
17051 MVT VT = Op.getSimpleValueType();
17052 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17053 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17058 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17059 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17063 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17064 assert(VT.getVectorNumElements() <= 16);
17065 Opc = X86ISD::CMPM;
17067 Opc = X86ISD::CMPP;
17068 // The SSE/AVX packed FP comparison nodes are defined with a
17069 // floating-point vector result that matches the operand type. This allows
17070 // them to work with an SSE1 target (integer vector types are not legal).
17071 VT = Op0.getSimpleValueType();
17074 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17075 // emit two comparisons and a logic op to tie them together.
17076 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17077 // available.
17079 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
17081 // LLVM predicate is SETUEQ or SETONE.
17082 unsigned CC0, CC1;
17083 unsigned CombineOpc;
17084 if (SetCCOpcode == ISD::SETUEQ) {
17085 CC0 = 3; // UNORD
17086 CC1 = 0; // EQ
17087 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17088 static_cast<unsigned>(ISD::OR);
17089 } else {
17090 assert(SetCCOpcode == ISD::SETONE);
17091 CC0 = 7; // ORD
17092 CC1 = 4; // NEQ
17093 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17094 static_cast<unsigned>(ISD::AND);
17097 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17098 DAG.getConstant(CC0, dl, MVT::i8));
17099 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17100 DAG.getConstant(CC1, dl, MVT::i8));
17101 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17103 // Handle all other FP comparisons here.
17104 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17105 DAG.getConstant(SSECC, dl, MVT::i8));
17108 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17109 // result type of SETCC. The bitcast is expected to be optimized away
17110 // during combining/isel.
17111 if (Opc == X86ISD::CMPP)
17112 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17117 MVT VTOp0 = Op0.getSimpleValueType();
17118 assert(VTOp0 == Op1.getSimpleValueType() &&
17119 "Expected operands with same type!");
17120 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17121 "Invalid number of packed elements for source and destination!");
17123 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17124 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17125 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17126 // legalizer first checks whether the first input operand of the setcc has
17127 // a legal type. If so, then it promotes the return type to that same type.
17128 // Otherwise, the return type is promoted to the 'next legal type' which,
17129 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17131 // We reach this code only if the following two conditions are met:
17132 // 1. Both return type and operand type have been promoted to wider types
17133 // by the type legalizer.
17134 // 2. The original operand type has been promoted to a 256-bit vector.
17136 // Note that condition 2. only applies for AVX targets.
17137 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17138 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17141 // The non-AVX512 code below works under the assumption that source and
17142 // destination types are the same.
17143 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17144 "Value types for source and destination must be the same!");
17146 // Break 256-bit integer vector compare into smaller ones.
17147 if (VT.is256BitVector() && !Subtarget.hasInt256())
17148 return Lower256IntVSETCC(Op, DAG);
17150 // Operands are boolean (vectors of i1)
17151 MVT OpVT = Op1.getSimpleValueType();
17152 if (OpVT.getVectorElementType() == MVT::i1)
17153 return LowerBoolVSETCC_AVX512(Op, DAG);
17155 // The result is boolean, but operands are int/float
17156 if (VT.getVectorElementType() == MVT::i1) {
17157 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17158 // but there is no compare instruction for i8 and i16 elements in KNL.
17159 // In this case use an SSE/AVX compare and truncate the result.
17160 bool UseAVX512Inst =
17161 (OpVT.is512BitVector() ||
17162 OpVT.getScalarSizeInBits() >= 32 ||
17163 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17165 if (UseAVX512Inst)
17166 return LowerIntVSETCC_AVX512(Op, DAG);
17168 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17169 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17172 // Lower using XOP integer comparisons.
17173 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17174 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17175 // Translate compare code to XOP PCOM compare mode.
17176 unsigned CmpMode = 0;
17177 switch (SetCCOpcode) {
17178 default: llvm_unreachable("Unexpected SETCC condition");
17180 case ISD::SETLT: CmpMode = 0x00; break;
17182 case ISD::SETLE: CmpMode = 0x01; break;
17184 case ISD::SETGT: CmpMode = 0x02; break;
17186 case ISD::SETGE: CmpMode = 0x03; break;
17187 case ISD::SETEQ: CmpMode = 0x04; break;
17188 case ISD::SETNE: CmpMode = 0x05; break;
17191 // Are we comparing unsigned or signed integers?
17192 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17193 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17195 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17196 DAG.getConstant(CmpMode, dl, MVT::i8));
17199 // We are handling one of the integer comparisons here. Since SSE only has
17200 // GT and EQ comparisons for integer, swapping operands and multiple
17201 // operations may be required for some comparisons.
17203 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17204 bool Subus = false;
17206 switch (SetCCOpcode) {
17207 default: llvm_unreachable("Unexpected SETCC condition");
17208 case ISD::SETNE: Invert = true;
17209 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17210 case ISD::SETLT: Swap = true;
17211 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17212 case ISD::SETGE: Swap = true;
17213 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17214 Invert = true; break;
17215 case ISD::SETULT: Swap = true;
17216 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17217 FlipSigns = true; break;
17218 case ISD::SETUGE: Swap = true;
17219 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17220 FlipSigns = true; Invert = true; break;
17223 // Special case: Use min/max operations for SETULE/SETUGE
17224 MVT VET = VT.getVectorElementType();
17226 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17227 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17230 switch (SetCCOpcode) {
17232 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17233 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17236 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17239 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17240 if (!MinMax && hasSubus) {
17241 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17242 // Op0 u<= Op1:
17243 // t = psubus Op0, Op1
17244 // pcmpeq t, <0..0>
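// PSUBUS saturates at zero, so the subtraction result is all-zero exactly
// when Op0 u<= Op1 element-wise; the PCMPEQ against zero then recovers the
// comparison mask.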
17245 switch (SetCCOpcode) {
17247 case ISD::SETULT: {
17248 // If the comparison is against a constant we can turn this into a
17249 // setule. With psubus, setule does not require a swap. This is
17250 // beneficial because the constant in the register is no longer
17251 // clobbered as the destination, so it can be hoisted out of a loop.
17252 // Only do this pre-AVX, since with AVX the compare is non-destructive anyway.
17253 if (Subtarget.hasAVX())
17255 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17257 Subus = true; Invert = false; Swap = false;
17261 // Psubus is better than flip-sign because it requires no inversion.
17262 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17263 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17267 Opc = X86ISD::SUBUS;
17273 std::swap(Op0, Op1);
17275 // Check that the operation in question is available (most are plain SSE2,
17276 // but PCMPGTQ and PCMPEQQ have different requirements).
17277 if (VT == MVT::v2i64) {
17278 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17279 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17281 // First cast everything to the right type.
17282 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17283 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17285 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17286 // bits of the inputs before performing those operations. The lower
17287 // compare is always unsigned.
17288 SDValue SB;
17289 if (FlipSigns) {
17290 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17291 } else {
17292 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17293 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17294 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17295 }
17296 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17297 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17299 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
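// The high dwords decide the comparison unless they are equal, in which case
// the (always unsigned) low-dword comparison breaks the tie; the sign-bit
// flips above make the signed PCMPGTD correct for each half.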
17300 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17301 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17303 // Create masks for only the low parts/high parts of the 64-bit integers.
17304 static const int MaskHi[] = { 1, 1, 3, 3 };
17305 static const int MaskLo[] = { 0, 0, 2, 2 };
17306 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17307 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17308 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17310 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17311 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17314 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17316 return DAG.getBitcast(VT, Result);
17319 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17320 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17321 // pcmpeqd + pshufd + pand.
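// A v2i64 lane is all-ones in the PCMPEQD result only if both of its i32
// halves compared equal, so AND the result with a copy whose halves are
// swapped ({1,0,3,2}) to get the 64-bit equality mask.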
17322 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17324 // First cast everything to the right type.
17325 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17326 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17329 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17331 // Make sure the lower and upper halves are both all-ones.
17332 static const int Mask[] = { 1, 0, 3, 2 };
17333 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17334 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17337 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17339 return DAG.getBitcast(VT, Result);
17343 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17344 // bits of the inputs before performing those operations.
17346 MVT EltVT = VT.getVectorElementType();
17347 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17349 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17350 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17353 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17355 // If the logical-not of the result is required, perform that now.
17357 Result = DAG.getNOT(dl, Result, VT);
17360 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17363 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17364 getZeroVector(VT, Subtarget, DAG, dl));
17369 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17371 MVT VT = Op.getSimpleValueType();
17373 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17375 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
17376 && "SetCC type must be 8-bit or 1-bit integer");
17377 SDValue Op0 = Op.getOperand(0);
17378 SDValue Op1 = Op.getOperand(1);
17380 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17382 // Optimize to BT if possible.
17383 // Lower (X & (1 << N)) == 0 to BT(X, N).
17384 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17385 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17386 // Lower (trunc (X >> N) to i1) to BT(X, N).
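// E.g. ((X >>u 3) & 1) != 0 becomes BT(X, 3) followed by a SETB of the carry
// flag.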
17387 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17388 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17389 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17391 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17396 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17397 // these.
17398 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17399 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17401 // If the input is a setcc, then reuse the input setcc or use a new one with
17402 // the inverted condition.
17403 if (Op0.getOpcode() == X86ISD::SETCC) {
17404 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17405 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17409 CCode = X86::GetOppositeBranchCondition(CCode);
17410 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17412 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17416 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17417 if (isOneConstant(Op1)) {
17418 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17419 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17421 if (!isNullConstant(Op1)) {
17422 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17423 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17427 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17428 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17429 if (X86CC == X86::COND_INVALID)
17432 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17433 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17434 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17436 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17440 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
17441 SDValue LHS = Op.getOperand(0);
17442 SDValue RHS = Op.getOperand(1);
17443 SDValue Carry = Op.getOperand(2);
17444 SDValue Cond = Op.getOperand(3);
17447 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
17448 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17450 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
17451 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17452 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
17453 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17454 if (Op.getSimpleValueType() == MVT::i1)
17455 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17459 /// Return true if opcode is a X86 logical comparison.
17460 static bool isX86LogicalCmp(SDValue Op) {
17461 unsigned Opc = Op.getOpcode();
17462 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17463 Opc == X86ISD::SAHF)
17465 if (Op.getResNo() == 1 &&
17466 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17467 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17468 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17469 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17472 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17478 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17479 if (V.getOpcode() != ISD::TRUNCATE)
17482 SDValue VOp0 = V.getOperand(0);
17483 unsigned InBits = VOp0.getValueSizeInBits();
17484 unsigned Bits = V.getValueSizeInBits();
17485 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17488 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17489 bool AddTest = true;
17490 SDValue Cond = Op.getOperand(0);
17491 SDValue Op1 = Op.getOperand(1);
17492 SDValue Op2 = Op.getOperand(2);
17494 MVT VT = Op1.getSimpleValueType();
17497 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17498 // are available or VBLENDV if AVX is available.
17499 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17500 if (Cond.getOpcode() == ISD::SETCC &&
17501 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17502 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17503 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17504 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17505 int SSECC = translateX86FSETCC(
17506 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17509 if (Subtarget.hasAVX512()) {
17510 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
17511 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17512 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17513 DL, VT, Cmp, Op1, Op2);
17516 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17517 DAG.getConstant(SSECC, DL, MVT::i8));
17519 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17520 // of 3 logic instructions for size savings and potentially speed.
17521 // Unfortunately, there is no scalar form of VBLENDV.
17523 // If either operand is a constant, don't try this. We can expect to
17524 // optimize away at least one of the logic instructions later in that
17525 // case, so that sequence would be faster than a variable blend.
17527 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17528 // uses XMM0 as the selection register. That may need just as many
17529 // instructions as the AND/ANDN/OR sequence due to register moves, so
17530 // don't bother.
17532 if (Subtarget.hasAVX() &&
17533 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17535 // Convert to vectors, do a VSELECT, and convert back to scalar.
17536 // All of the conversions should be optimized away.
17538 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17539 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17540 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17541 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17543 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17544 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17546 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
17548 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17549 VSel, DAG.getIntPtrConstant(0, DL));
17551 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17552 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17553 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17557 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17558 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
17559 Subtarget.hasAVX512())
17560 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
17562 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17564 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17565 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17566 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17567 Op1Scalar = Op1.getOperand(0);
17569 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17570 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17571 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17572 Op2Scalar = Op2.getOperand(0);
17573 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17574 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
17575 Op1Scalar.getValueType(),
17576 Cond, Op1Scalar, Op2Scalar);
17577 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17578 return DAG.getBitcast(VT, newSelect);
17579 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17580 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17581 DAG.getIntPtrConstant(0, DL));
17585 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17586 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17587 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17588 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17589 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17590 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17591 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
17593 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17596 if (Cond.getOpcode() == ISD::SETCC) {
17597 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17599 // If the condition was updated, it's possible that the operands of the
17600 // select were also updated (for example, EmitTest has a RAUW). Refresh
17601 // the local references to the select operands in case they got stale.
17602 Op1 = Op.getOperand(1);
17603 Op2 = Op.getOperand(2);
17607 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17608 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17609 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17610 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17611 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17612 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
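// The first four patterns work because CMP(x, 1) sets CF exactly when x == 0
// (unsigned x < 1), and SETCC_CARRY materializes CF as 0 or all-ones (like
// SBB reg,reg), which is then optionally inverted and OR'ed with y.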
17613 if (Cond.getOpcode() == X86ISD::SETCC &&
17614 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17615 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17616 SDValue Cmp = Cond.getOperand(1);
17617 unsigned CondCode =
17618 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17620 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17621 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17622 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17624 SDValue CmpOp0 = Cmp.getOperand(0);
17625 // Apply further optimizations for special cases
17626 // (select (x != 0), -1, 0) -> neg & sbb
17627 // (select (x == 0), 0, -1) -> neg & sbb
17628 if (isNullConstant(Y) &&
17629 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17630 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17631 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17632 DAG.getConstant(0, DL,
17633 CmpOp0.getValueType()),
17635 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17636 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17637 SDValue(Neg.getNode(), 1));
17641 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17642 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17643 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17645 SDValue Res = // Res = 0 or -1.
17646 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17647 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17649 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17650 Res = DAG.getNOT(DL, Res, Res.getValueType());
17652 if (!isNullConstant(Op2))
17653 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17655 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17656 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17657 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17658 SDValue CmpOp0 = Cmp.getOperand(0);
17659 SDValue Src1, Src2;
17660 // True if Op2 is an XOR or OR operator and one of its operands
17661 // is equal to Op1, i.e.
17662 // ( a , a op b) || ( b , a op b)
17663 auto isOrXorPattern = [&]() {
17664 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17665 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17667 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17674 if (isOrXorPattern()) {
17676 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17677 // We need a mask of all zeros or all ones with the same size as the
17678 // other operand.
17679 if (CmpSz > VT.getSizeInBits())
17680 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17681 else if (CmpSz < VT.getSizeInBits())
17682 Neg = DAG.getNode(ISD::AND, DL, VT,
17683 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17684 DAG.getConstant(1, DL, VT));
17687 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17688 Neg); // -(and (x, 0x1))
17689 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17690 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17695 // Look past (and (setcc_carry (cmp ...)), 1).
17696 if (Cond.getOpcode() == ISD::AND &&
17697 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17698 isOneConstant(Cond.getOperand(1)))
17699 Cond = Cond.getOperand(0);
17701 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17702 // setting operand in place of the X86ISD::SETCC.
17703 unsigned CondOpcode = Cond.getOpcode();
17704 if (CondOpcode == X86ISD::SETCC ||
17705 CondOpcode == X86ISD::SETCC_CARRY) {
17706 CC = Cond.getOperand(0);
17708 SDValue Cmp = Cond.getOperand(1);
17709 unsigned Opc = Cmp.getOpcode();
17710 MVT VT = Op.getSimpleValueType();
17712 bool IllegalFPCMov = false;
17713 if (VT.isFloatingPoint() && !VT.isVector() &&
17714 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17715 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17717 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17718 Opc == X86ISD::BT) { // FIXME
17722 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17723 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17724 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17725 Cond.getOperand(0).getValueType() != MVT::i8)) {
17726 SDValue LHS = Cond.getOperand(0);
17727 SDValue RHS = Cond.getOperand(1);
17728 unsigned X86Opcode;
17731 switch (CondOpcode) {
17732 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17733 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17734 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17735 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17736 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17737 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17738 default: llvm_unreachable("unexpected overflowing operator");
17740 if (CondOpcode == ISD::UMULO)
17741 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17744 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17746 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17748 if (CondOpcode == ISD::UMULO)
17749 Cond = X86Op.getValue(2);
17751 Cond = X86Op.getValue(1);
17753 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17758 // Look past the truncate if the high bits are known zero.
17759 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17760 Cond = Cond.getOperand(0);
17762 // We know the result of AND is compared against zero. Try to match
17764 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17765 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17766 CC = NewSetCC.getOperand(0);
17767 Cond = NewSetCC.getOperand(1);
17774 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17775 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17778 // a < b ? -1 : 0 -> RES = ~setcc_carry
17779 // a < b ? 0 : -1 -> RES = setcc_carry
17780 // a >= b ? -1 : 0 -> RES = setcc_carry
17781 // a >= b ? 0 : -1 -> RES = ~setcc_carry
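// Here the SUB computing (a - b) already leaves CF = (a <u b), so
// SETCC_CARRY(COND_B) yields the 0/-1 result directly, with a NOT when the
// polarity is reversed.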
17782 if (Cond.getOpcode() == X86ISD::SUB) {
17783 Cond = ConvertCmpIfNecessary(Cond, DAG);
17784 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17786 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17787 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17788 (isNullConstant(Op1) || isNullConstant(Op2))) {
17789 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17790 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17792 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17793 return DAG.getNOT(DL, Res, Res.getValueType());
17798 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
17799 // widen the cmov and push the truncate through. This avoids introducing a new
17800 // branch during isel and doesn't add any extensions.
17801 if (Op.getValueType() == MVT::i8 &&
17802 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17803 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17804 if (T1.getValueType() == T2.getValueType() &&
17805 // Blacklist CopyFromReg to avoid partial register stalls.
17806 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17807 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17808 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17809 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17813 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17814 // condition is true.
17815 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17816 SDValue Ops[] = { Op2, Op1, CC, Cond };
17817 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17820 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17821 const X86Subtarget &Subtarget,
17822 SelectionDAG &DAG) {
17823 MVT VT = Op->getSimpleValueType(0);
17824 SDValue In = Op->getOperand(0);
17825 MVT InVT = In.getSimpleValueType();
17826 MVT VTElt = VT.getVectorElementType();
17827 MVT InVTElt = InVT.getVectorElementType();
17831 if ((InVTElt == MVT::i1) &&
17832 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17834 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17836 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17838 unsigned NumElts = VT.getVectorNumElements();
17840 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17841 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17842 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17843 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17844 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17847 if (InVTElt != MVT::i1)
17851 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17852 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17855 if (Subtarget.hasDQI()) {
17856 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17857 assert(!VT.is512BitVector() && "Unexpected vector type");
17859 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17860 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17861 V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17866 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17869 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17870 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17871 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17872 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17873 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17874 const X86Subtarget &Subtarget,
17875 SelectionDAG &DAG) {
17876 SDValue In = Op->getOperand(0);
17877 MVT VT = Op->getSimpleValueType(0);
17878 MVT InVT = In.getSimpleValueType();
17879 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17881 MVT SVT = VT.getVectorElementType();
17882 MVT InSVT = InVT.getVectorElementType();
17883 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17885 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17887 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17889 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17890 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17891 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17896 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17897 // For 512-bit vectors, we need 128-bits or 256-bits.
17898 if (VT.getSizeInBits() > 128) {
17899 // Input needs to be at least the same number of elements as output, and
17900 // at least 128-bits.
17901 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17902 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17905 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17906 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17908 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17909 // so those cases are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17910 // need to be handled here for 256/512-bit results.
17911 if (Subtarget.hasInt256()) {
17912 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17913 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17914 X86ISD::VSEXT : X86ISD::VZEXT;
17915 return DAG.getNode(ExtOpc, dl, VT, In);
17918 // We should only get here for sign extend.
17919 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17920 "Unexpected opcode!");
17922 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17926 // As SRAI is only available on i16/i32 types, we expand only up to i32
17927 // and handle i64 separately.
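// E.g. v16i8 -> v4i32: the unpacks below place each source byte in the high
// end of a progressively wider lane, and the arithmetic shift right then
// moves it back down while filling the upper bits with copies of its sign bit.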
17928 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17929 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17930 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17931 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17932 Curr = DAG.getBitcast(CurrVT, Curr);
17935 SDValue SignExt = Curr;
17936 if (CurrVT != InVT) {
17937 unsigned SignExtShift =
17938 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17939 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17940 DAG.getConstant(SignExtShift, dl, MVT::i8));
17946 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17947 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17948 DAG.getConstant(31, dl, MVT::i8));
17949 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17950 return DAG.getBitcast(VT, Ext);
17956 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17957 SelectionDAG &DAG) {
17958 MVT VT = Op->getSimpleValueType(0);
17959 SDValue In = Op->getOperand(0);
17960 MVT InVT = In.getSimpleValueType();
17963 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17964 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17966 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17967 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17968 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17971 if (Subtarget.hasInt256())
17972 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17974 // Optimize vectors in AVX mode
17975 // Sign extend v8i16 to v8i32 and
17978 // Divide input vector into two parts
17979 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17980 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17981 // concat the vectors to original VT
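// E.g. on AVX1, v8i16 -> v8i32: the shuffles below produce the low and high
// halves (padded with undef), each half is sign-extended in-register to
// v4i32, and the two results are concatenated back into v8i32.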
17983 unsigned NumElems = InVT.getVectorNumElements();
17984 SDValue Undef = DAG.getUNDEF(InVT);
17986 SmallVector<int,8> ShufMask1(NumElems, -1);
17987 for (unsigned i = 0; i != NumElems/2; ++i)
17990 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17992 SmallVector<int,8> ShufMask2(NumElems, -1);
17993 for (unsigned i = 0; i != NumElems/2; ++i)
17994 ShufMask2[i] = i + NumElems/2;
17996 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17998 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17999 VT.getVectorNumElements() / 2);
18001 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18002 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18004 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18007 // Lower a truncating store. We need a special lowering for vXi1 vectors.
18008 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18009 SelectionDAG &DAG) {
18010 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18012 EVT MemVT = St->getMemoryVT();
18013 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
18014 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18015 "Expected truncstore of i1 vector");
18017 SDValue Op = St->getValue();
18018 MVT OpVT = Op.getValueType().getSimpleVT();
18019 unsigned NumElts = OpVT.getVectorNumElements();
18020 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18022 // Truncate and store - everything is legal
18023 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18024 if (MemVT.getSizeInBits() < 8)
18025 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18026 DAG.getUNDEF(MVT::v8i1), Op,
18027 DAG.getIntPtrConstant(0, dl));
18028 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18029 St->getMemOperand());
18032 // A subset, assume that we have only AVX-512F
18033 if (NumElts <= 8) {
18035 // Extend to 8-elts vector
18036 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18037 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18038 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18040 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18041 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18042 St->getMemOperand());
18045 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18046 // Divide the vector into 2 parts and store each part separately
18047 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18048 DAG.getIntPtrConstant(0, dl));
18049 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18050 SDValue BasePtr = St->getBasePtr();
18051 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18052 St->getMemOperand());
18053 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18054 DAG.getIntPtrConstant(16, dl));
18055 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18057 SDValue BasePtrHi =
18058 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18059 DAG.getConstant(2, dl, BasePtr.getValueType()));
18061 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18062 BasePtrHi, St->getMemOperand());
18063 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18066 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18067 const X86Subtarget &Subtarget,
18068 SelectionDAG &DAG) {
18070 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18072 EVT MemVT = Ld->getMemoryVT();
18073 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18074 "Expected i1 vector load");
18075 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18076 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18077 MVT VT = Op.getValueType().getSimpleVT();
18078 unsigned NumElts = VT.getVectorNumElements();
18080 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18081 (Subtarget.hasDQI() && NumElts < 16) ||
18083 // Load and extend - everything is legal
18085 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18087 Ld->getMemOperand());
18088 // Replace chain users with the new chain.
18089 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18090 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18091 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18092 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18094 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18095 DAG.getIntPtrConstant(0, dl));
18097 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18099 Ld->getMemOperand());
18100 // Replace chain users with the new chain.
18101 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18102 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18104 // Finally, do a normal sign-extend to the desired register.
18105 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18108 if (NumElts <= 8) {
18109 // A subset, assume that we have only AVX-512F
18110 unsigned NumBitsToLoad = 8;
18111 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18112 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18114 Ld->getMemOperand());
18115 // Replace chain users with the new chain.
18116 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18117 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18119 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18120 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18123 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18125 // We need to take care of v4i1 and v2i1 here.
18127 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18128 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18130 DAG.getIntPtrConstant(0, dl));
18133 assert(VT == MVT::v32i8 && "Unexpected extload type");
18135 SmallVector<SDValue, 2> Chains;
18137 SDValue BasePtr = Ld->getBasePtr();
18138 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18140 Ld->getMemOperand());
18141 Chains.push_back(LoadLo.getValue(1));
18143 SDValue BasePtrHi =
18144 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18145 DAG.getConstant(2, dl, BasePtr.getValueType()));
18147 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18149 Ld->getMemOperand());
18150 Chains.push_back(LoadHi.getValue(1));
18151 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18152 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18154 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18155 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18156 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18159 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18160 // may emit an illegal shuffle but the expansion is still better than scalar
18161 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18162 // we'll emit a shuffle and an arithmetic shift.
18163 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18164 // TODO: It is possible to support ZExt by zeroing the undef values during
18165 // the shuffle phase or after the shuffle.
18166 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18167 SelectionDAG &DAG) {
18168 MVT RegVT = Op.getSimpleValueType();
18169 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18170 assert(RegVT.isInteger() &&
18171 "We only custom lower integer vector sext loads.");
18173 // Nothing useful we can do without SSE2 shuffles.
18174 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18176 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18178 EVT MemVT = Ld->getMemoryVT();
18179 if (MemVT.getScalarType() == MVT::i1)
18180 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18182 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18183 unsigned RegSz = RegVT.getSizeInBits();
18185 ISD::LoadExtType Ext = Ld->getExtensionType();
18187 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18188 && "Only anyext and sext are currently implemented.");
18189 assert(MemVT != RegVT && "Cannot extend to the same type");
18190 assert(MemVT.isVector() && "Must load a vector from memory");
18192 unsigned NumElems = RegVT.getVectorNumElements();
18193 unsigned MemSz = MemVT.getSizeInBits();
18194 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18196 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18197 // The only way in which we have a legal 256-bit vector result but not the
18198 // integer 256-bit operations needed to directly lower a sextload is if we
18199 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18200 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18201 // correctly legalized. We do this late to allow the canonical form of
18202 // sextload to persist throughout the rest of the DAG combiner -- it wants
18203 // to fold together any extensions it can, and so will fuse a sign_extend
18204 // of an sextload into a sextload targeting a wider value.
18206 if (MemSz == 128) {
18207 // Just switch this to a normal load.
18208 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18209 "it must be a legal 128-bit vector "
18211 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18212 Ld->getPointerInfo(), Ld->getAlignment(),
18213 Ld->getMemOperand()->getFlags());
18215 assert(MemSz < 128 &&
18216 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18217 // Do an sext load to a 128-bit vector type. We want to use the same
18218 // number of elements, but elements half as wide. This will end up being
18219 // recursively lowered by this routine, but will succeed as we definitely
18220 // have all the necessary features if we're using AVX1.
18222 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18223 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18225 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18226 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18227 Ld->getMemOperand()->getFlags());
18230 // Replace chain users with the new chain.
18231 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18232 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18234 // Finally, do a normal sign-extend to the desired register.
18235 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18238 // All sizes must be a power of two.
18239 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18240 "Non-power-of-two elements are not custom lowered!");
18242 // Attempt to load the original value using scalar loads.
18243 // Find the largest scalar type that divides the total loaded size.
18244 MVT SclrLoadTy = MVT::i8;
18245 for (MVT Tp : MVT::integer_valuetypes()) {
18246 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18251 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
18252 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18254 SclrLoadTy = MVT::f64;
18256 // Calculate the number of scalar loads that we need to perform
18257 // in order to load our vector from memory.
18258 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18260 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18261 "Can only lower sext loads with a single scalar load!");
18263 unsigned loadRegSize = RegSz;
18264 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18265 loadRegSize = 128;
18267 // Represent our vector as a sequence of elements which are the
18268 // largest scalar that we can load.
18269 EVT LoadUnitVecVT = EVT::getVectorVT(
18270 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
18272 // Represent the data using the same element type that is stored in
18273 // memory. In practice, we 'widen' MemVT.
18275 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18276 loadRegSize / MemVT.getScalarSizeInBits());
18278 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18279 "Invalid vector type");
18281 // We can't shuffle using an illegal type.
18282 assert(TLI.isTypeLegal(WideVecVT) &&
18283 "We only lower types that form legal widened vector types");
18285 SmallVector<SDValue, 8> Chains;
18286 SDValue Ptr = Ld->getBasePtr();
18287 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18288 TLI.getPointerTy(DAG.getDataLayout()));
18289 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18291 for (unsigned i = 0; i < NumLoads; ++i) {
18292 // Perform a single load.
18293 SDValue ScalarLoad =
18294 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18295 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18296 Chains.push_back(ScalarLoad.getValue(1));
18297 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18298 // another round of DAGCombining.
18300 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18302 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18303 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18305 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18308 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18310 // Bitcast the loaded value to a vector of the original element type, in
18311 // the size of the target vector type.
18312 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18313 unsigned SizeRatio = RegSz / MemSz;
18315 if (Ext == ISD::SEXTLOAD) {
18316 // If we have SSE4.1, we can directly emit a VSEXT node.
18317 if (Subtarget.hasSSE41()) {
18318 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18319 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18323 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18325 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18326 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18328 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18329 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18333 // Redistribute the loaded elements into the different locations.
18334 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18335 for (unsigned i = 0; i != NumElems; ++i)
18336 ShuffleVec[i * SizeRatio] = i;
18338 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18339 DAG.getUNDEF(WideVecVT), ShuffleVec);
18341 // Bitcast to the requested type.
18342 Shuff = DAG.getBitcast(RegVT, Shuff);
18343 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18347 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18348 /// each of which has no other use apart from the AND / OR.
18349 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18350 Opc = Op.getOpcode();
18351 if (Opc != ISD::OR && Opc != ISD::AND)
18353 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18354 Op.getOperand(0).hasOneUse() &&
18355 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18356 Op.getOperand(1).hasOneUse());
18359 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18360 /// SETCC node has a single use.
18361 static bool isXor1OfSetCC(SDValue Op) {
18362 if (Op.getOpcode() != ISD::XOR)
18364 if (isOneConstant(Op.getOperand(1)))
18365 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18366 Op.getOperand(0).hasOneUse();
18370 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18371 bool addTest = true;
18372 SDValue Chain = Op.getOperand(0);
18373 SDValue Cond = Op.getOperand(1);
18374 SDValue Dest = Op.getOperand(2);
18377 bool Inverted = false;
18379 if (Cond.getOpcode() == ISD::SETCC) {
18380 // Check for setcc([su]{add,sub,mul}o == 0).
18381 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18382 isNullConstant(Cond.getOperand(1)) &&
18383 Cond.getOperand(0).getResNo() == 1 &&
18384 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18385 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18386 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18387 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18388 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18389 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18391 Cond = Cond.getOperand(0);
18393 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18398 // FIXME: LowerXALUO doesn't handle these!!
18399 else if (Cond.getOpcode() == X86ISD::ADD ||
18400 Cond.getOpcode() == X86ISD::SUB ||
18401 Cond.getOpcode() == X86ISD::SMUL ||
18402 Cond.getOpcode() == X86ISD::UMUL)
18403 Cond = LowerXALUO(Cond, DAG);
18406 // Look past (and (setcc_carry (cmp ...)), 1).
18407 if (Cond.getOpcode() == ISD::AND &&
18408 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18409 isOneConstant(Cond.getOperand(1)))
18410 Cond = Cond.getOperand(0);
18412 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18413 // setting operand in place of the X86ISD::SETCC.
18414 unsigned CondOpcode = Cond.getOpcode();
18415 if (CondOpcode == X86ISD::SETCC ||
18416 CondOpcode == X86ISD::SETCC_CARRY) {
18417 CC = Cond.getOperand(0);
18419 SDValue Cmp = Cond.getOperand(1);
18420 unsigned Opc = Cmp.getOpcode();
18421 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18422 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18426 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18430 // These can only come from an arithmetic instruction with overflow,
18431 // e.g. SADDO, UADDO.
18432 Cond = Cond.getOperand(1);
18438 CondOpcode = Cond.getOpcode();
18439 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18440 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18441 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18442 Cond.getOperand(0).getValueType() != MVT::i8)) {
18443 SDValue LHS = Cond.getOperand(0);
18444 SDValue RHS = Cond.getOperand(1);
18445 unsigned X86Opcode;
18446 unsigned X86Cond;
18447 SDVTList VTs;
18448 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18449 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18450 // X86ISD::INC).
18451 switch (CondOpcode) {
18452 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18453 case ISD::SADDO:
18454 if (isOneConstant(RHS)) {
18455 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18456 break;
18457 }
18458 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18459 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18460 case ISD::SSUBO:
18461 if (isOneConstant(RHS)) {
18462 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18463 break;
18464 }
18465 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18466 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18467 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18468 default: llvm_unreachable("unexpected overflowing operator");
18471 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18472 if (CondOpcode == ISD::UMULO)
18473 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18474 MVT::i32);
18475 else
18476 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18478 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18480 if (CondOpcode == ISD::UMULO)
18481 Cond = X86Op.getValue(2);
18482 else
18483 Cond = X86Op.getValue(1);
18485 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18486 addTest = false;
18487 } else {
18488 unsigned CondOpc;
18489 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18490 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18491 if (CondOpc == ISD::OR) {
18492 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18493 // two branches instead of an explicit OR instruction with a
18494 // separate test.
18495 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18496 isX86LogicalCmp(Cmp)) {
18497 CC = Cond.getOperand(0).getOperand(0);
18498 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18499 Chain, Dest, CC, Cmp);
18500 CC = Cond.getOperand(1).getOperand(0);
18504 } else { // ISD::AND
18505 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18506 // two branches instead of an explicit AND instruction with a
18507 // separate test. However, we only do this if this block doesn't
18508 // have a fall-through edge, because this requires an explicit
18509 // jmp when the condition is false.
18510 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18511 isX86LogicalCmp(Cmp) &&
18512 Op.getNode()->hasOneUse()) {
18513 X86::CondCode CCode =
18514 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18515 CCode = X86::GetOppositeBranchCondition(CCode);
18516 CC = DAG.getConstant(CCode, dl, MVT::i8);
18517 SDNode *User = *Op.getNode()->use_begin();
18518 // Look for an unconditional branch following this conditional branch.
18519 // We need this because we need to reverse the successors in order
18520 // to implement FCMP_OEQ.
18521 if (User->getOpcode() == ISD::BR) {
18522 SDValue FalseBB = User->getOperand(1);
18523 SDNode *NewBR =
18524 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18525 assert(NewBR == User);
18526 (void)NewBR;
18527 Dest = FalseBB;
18529 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18530 Chain, Dest, CC, Cmp);
18531 X86::CondCode CCode =
18532 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18533 CCode = X86::GetOppositeBranchCondition(CCode);
18534 CC = DAG.getConstant(CCode, dl, MVT::i8);
18540 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18541 // Recognize the pattern xorb (setcc), 1; the xor inverts the condition.
18542 // It should be transformed during DAG combining except when the condition
18543 // is set by an arithmetic-with-overflow node.
18544 X86::CondCode CCode =
18545 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18546 CCode = X86::GetOppositeBranchCondition(CCode);
18547 CC = DAG.getConstant(CCode, dl, MVT::i8);
18548 Cond = Cond.getOperand(0).getOperand(1);
18550 } else if (Cond.getOpcode() == ISD::SETCC &&
18551 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18552 // For FCMP_OEQ, we can emit
18553 // two branches instead of an explicit AND instruction with a
18554 // separate test. However, we only do this if this block doesn't
18555 // have a fall-through edge, because this requires an explicit
18556 // jmp when the condition is false.
18557 if (Op.getNode()->hasOneUse()) {
18558 SDNode *User = *Op.getNode()->use_begin();
18559 // Look for an unconditional branch following this conditional branch.
18560 // We need this because we need to reverse the successors in order
18561 // to implement FCMP_OEQ.
18562 if (User->getOpcode() == ISD::BR) {
18563 SDValue FalseBB = User->getOperand(1);
18564 SDNode *NewBR =
18565 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18566 assert(NewBR == User);
18567 (void)NewBR;
18568 Dest = FalseBB;
18570 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18571 Cond.getOperand(0), Cond.getOperand(1));
18572 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18573 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18574 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18575 Chain, Dest, CC, Cmp);
18576 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18581 } else if (Cond.getOpcode() == ISD::SETCC &&
18582 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18583 // For FCMP_UNE, we can emit
18584 // two branches instead of an explicit AND instruction with a
18585 // separate test. However, we only do this if this block doesn't
18586 // have a fall-through edge, because this requires an explicit
18587 // jmp when the condition is false.
18588 if (Op.getNode()->hasOneUse()) {
18589 SDNode *User = *Op.getNode()->use_begin();
18590 // Look for an unconditional branch following this conditional branch.
18591 // We need this because we need to reverse the successors in order
18592 // to implement FCMP_UNE.
18593 if (User->getOpcode() == ISD::BR) {
18594 SDValue FalseBB = User->getOperand(1);
18595 SDNode *NewBR =
18596 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18597 assert(NewBR == User);
18598 (void)NewBR;
18599 Dest = FalseBB;
18600 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18601 Cond.getOperand(0), Cond.getOperand(1));
18602 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18603 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18604 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18605 Chain, Dest, CC, Cmp);
18606 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18616 // Look past the truncate if the high bits are known zero.
18617 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18618 Cond = Cond.getOperand(0);
18620 // We know the result is compared against zero. Try to match it to BT.
18621 if (Cond.hasOneUse()) {
18622 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18623 CC = NewSetCC.getOperand(0);
18624 Cond = NewSetCC.getOperand(1);
18631 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18632 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18633 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18635 Cond = ConvertCmpIfNecessary(Cond, DAG);
18636 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18637 Chain, Dest, CC, Cond);
18640 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18641 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18642 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18643 // that the guard pages used by the OS virtual memory manager are allocated in
18644 // correct sequence.
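// For example, a single 16K allocation must touch the guard page at SP-4K
// before the pages at SP-8K, SP-12K and SP-16K can be made accessible.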
18645 SDValue
18646 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18647 SelectionDAG &DAG) const {
18648 MachineFunction &MF = DAG.getMachineFunction();
18649 bool SplitStack = MF.shouldSplitStack();
18650 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18651 SplitStack;
18652 SDLoc dl(Op);
18654 // Get the inputs.
18655 SDNode *Node = Op.getNode();
18656 SDValue Chain = Op.getOperand(0);
18657 SDValue Size = Op.getOperand(1);
18658 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18659 EVT VT = Node->getValueType(0);
18661 // Chain the dynamic stack allocation so that it doesn't modify the stack
18662 // pointer when other instructions are using the stack.
18663 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18665 bool Is64Bit = Subtarget.is64Bit();
18666 MVT SPTy = getPointerTy(DAG.getDataLayout());
18668 SDValue Result;
18669 if (!Lower) {
18670 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18671 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18672 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18673 " not tell us which reg is the stack pointer!");
18675 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18676 Chain = SP.getValue(1);
18677 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18678 unsigned StackAlign = TFI.getStackAlignment();
18679 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18680 if (Align > StackAlign)
18681 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18682 DAG.getConstant(-(uint64_t)Align, dl, VT));
18683 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18684 } else if (SplitStack) {
18685 MachineRegisterInfo &MRI = MF.getRegInfo();
18687 if (Is64Bit) {
18688 // The 64-bit implementation of segmented stacks needs to clobber both r10
18689 // and r11. This makes it impossible to use it along with nested parameters.
18690 const Function *F = MF.getFunction();
18691 for (const auto &A : F->args()) {
18692 if (A.hasNestAttr())
18693 report_fatal_error("Cannot use segmented stacks with functions that "
18694 "have nested arguments.");
18698 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18699 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18700 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18701 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18702 DAG.getRegister(Vreg, SPTy));
18703 } else {
18704 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18705 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18706 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18708 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18709 unsigned SPReg = RegInfo->getStackRegister();
18710 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18711 Chain = SP.getValue(1);
18713 if (Align) {
18714 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18715 DAG.getConstant(-(uint64_t)Align, dl, VT));
18716 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18717 }
18719 Result = SP;
18720 }
18722 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18723 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18725 SDValue Ops[2] = {Result, Chain};
18726 return DAG.getMergeValues(Ops, dl);
18729 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18730 MachineFunction &MF = DAG.getMachineFunction();
18731 auto PtrVT = getPointerTy(MF.getDataLayout());
18732 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18734 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18735 SDLoc DL(Op);
18737 if (!Subtarget.is64Bit() ||
18738 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18739 // vastart just stores the address of the VarArgsFrameIndex slot into the
18740 // memory location argument.
18741 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18742 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18743 MachinePointerInfo(SV));
18746 // __va_list_tag:
18747 //   gp_offset         (0 - 6 * 8)
18748 //   fp_offset         (48 - 48 + 8 * 16)
18749 //   overflow_arg_area (point to parameters coming in memory).
18750 //   reg_save_area
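// This mirrors the System V AMD64 ABI definition:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };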
18751 SmallVector<SDValue, 8> MemOps;
18752 SDValue FIN = Op.getOperand(1);
18753 // Store gp_offset.
18754 SDValue Store = DAG.getStore(
18755 Op.getOperand(0), DL,
18756 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18757 MachinePointerInfo(SV));
18758 MemOps.push_back(Store);
18760 // Store fp_offset.
18761 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18762 Store = DAG.getStore(
18763 Op.getOperand(0), DL,
18764 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18765 MachinePointerInfo(SV, 4));
18766 MemOps.push_back(Store);
18768 // Store ptr to overflow_arg_area
18769 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18770 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18771 Store =
18772 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18773 MemOps.push_back(Store);
18775 // Store ptr to reg_save_area.
18776 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18777 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18778 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18779 Store = DAG.getStore(
18780 Op.getOperand(0), DL, RSFIN, FIN,
18781 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18782 MemOps.push_back(Store);
18783 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18786 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18787 assert(Subtarget.is64Bit() &&
18788 "LowerVAARG only handles 64-bit va_arg!");
18789 assert(Op.getNumOperands() == 4);
18791 MachineFunction &MF = DAG.getMachineFunction();
18792 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18793 // The Win64 ABI uses char* instead of a structure.
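// (expandVAArg emits the generic sequence: load the current argument pointer,
// advance it by the argument size, store it back, and load the value.)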
18794 return DAG.expandVAArg(Op.getNode());
18796 SDValue Chain = Op.getOperand(0);
18797 SDValue SrcPtr = Op.getOperand(1);
18798 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18799 unsigned Align = Op.getConstantOperandVal(3);
18800 SDLoc dl(Op);
18802 EVT ArgVT = Op.getNode()->getValueType(0);
18803 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18804 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18805 uint8_t ArgMode;
18807 // Decide which area this value should be read from.
18808 // TODO: Implement the AMD64 ABI in its entirety. This simple
18809 // selection mechanism works only for the basic types.
18810 if (ArgVT == MVT::f80) {
18811 llvm_unreachable("va_arg for f80 not yet implemented");
18812 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18813 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18814 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18815 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18817 llvm_unreachable("Unhandled argument type in LowerVAARG");
18820 if (ArgMode == 2) {
18821 // Sanity Check: Make sure using fp_offset makes sense.
18822 assert(!Subtarget.useSoftFloat() &&
18823 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18824 Subtarget.hasSSE1());
18825 }
18827 // Insert VAARG_64 node into the DAG
18828 // VAARG_64 returns two values: Variable Argument Address, Chain
18829 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18830 DAG.getConstant(ArgMode, dl, MVT::i8),
18831 DAG.getConstant(Align, dl, MVT::i32)};
18832 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18833 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18834 VTs, InstOps, MVT::i64,
18835 MachinePointerInfo(SV),
18836 /*Align=*/0,
18837 /*Volatile=*/false,
18838 /*ReadMem=*/true,
18839 /*WriteMem=*/true);
18840 Chain = VAARG.getValue(1);
18842 // Load the next argument and return it
18843 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18846 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18847 SelectionDAG &DAG) {
18848 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18849 // where a va_list is still an i8*.
18850 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18851 if (Subtarget.isCallingConvWin64(
18852 DAG.getMachineFunction().getFunction()->getCallingConv()))
18853 // Probably a Win64 va_copy.
18854 return DAG.expandVACopy(Op.getNode());
18856 SDValue Chain = Op.getOperand(0);
18857 SDValue DstPtr = Op.getOperand(1);
18858 SDValue SrcPtr = Op.getOperand(2);
18859 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18860 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
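// A 64-bit va_list occupies 24 bytes: two i32 offsets plus two 8-byte
// pointers, which is exactly the amount copied below.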
18861 SDLoc DL(Op);
18863 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18864 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18866 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18869 /// Handle vector element shifts where the shift amount is a constant.
18870 /// Takes immediate version of shift as input.
18871 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18872 SDValue SrcOp, uint64_t ShiftAmt,
18873 SelectionDAG &DAG) {
18874 MVT ElementType = VT.getVectorElementType();
18876 // Bitcast the source vector to the output type, this is mainly necessary for
18877 // vXi8/vXi64 shifts.
18878 if (VT != SrcOp.getSimpleValueType())
18879 SrcOp = DAG.getBitcast(VT, SrcOp);
18881 // Fold this packed shift into its first operand if ShiftAmt is 0.
18882 if (!ShiftAmt)
18883 return SrcOp;
18885 // Check for ShiftAmt >= element width
18886 if (ShiftAmt >= ElementType.getSizeInBits()) {
18887 if (Opc == X86ISD::VSRAI)
18888 ShiftAmt = ElementType.getSizeInBits() - 1;
18889 else
18890 return DAG.getConstant(0, dl, VT);
18891 }
18893 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18894 && "Unknown target vector shift-by-constant node");
18896 // Fold this packed vector shift into a build vector if SrcOp is a
18897 // vector of Constants or UNDEFs.
18898 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18899 SmallVector<SDValue, 8> Elts;
18900 unsigned NumElts = SrcOp->getNumOperands();
18901 ConstantSDNode *ND;
18904 default: llvm_unreachable("Unknown opcode!");
18905 case X86ISD::VSHLI:
18906 for (unsigned i=0; i!=NumElts; ++i) {
18907 SDValue CurrentOp = SrcOp->getOperand(i);
18908 if (CurrentOp->isUndef()) {
18909 Elts.push_back(CurrentOp);
18912 ND = cast<ConstantSDNode>(CurrentOp);
18913 const APInt &C = ND->getAPIntValue();
18914 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18917 case X86ISD::VSRLI:
18918 for (unsigned i=0; i!=NumElts; ++i) {
18919 SDValue CurrentOp = SrcOp->getOperand(i);
18920 if (CurrentOp->isUndef()) {
18921 Elts.push_back(CurrentOp);
18924 ND = cast<ConstantSDNode>(CurrentOp);
18925 const APInt &C = ND->getAPIntValue();
18926 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18929 case X86ISD::VSRAI:
18930 for (unsigned i=0; i!=NumElts; ++i) {
18931 SDValue CurrentOp = SrcOp->getOperand(i);
18932 if (CurrentOp->isUndef()) {
18933 Elts.push_back(CurrentOp);
18936 ND = cast<ConstantSDNode>(CurrentOp);
18937 const APInt &C = ND->getAPIntValue();
18938 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18943 return DAG.getBuildVector(VT, dl, Elts);
18946 return DAG.getNode(Opc, dl, VT, SrcOp,
18947 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18950 /// Handle vector element shifts where the shift amount may or may not be a
18951 /// constant. Takes immediate version of shift as input.
18952 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18953 SDValue SrcOp, SDValue ShAmt,
18954 const X86Subtarget &Subtarget,
18955 SelectionDAG &DAG) {
18956 MVT SVT = ShAmt.getSimpleValueType();
18957 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18959 // Catch shift-by-constant.
18960 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18961 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18962 CShAmt->getZExtValue(), DAG);
18964 // Change opcode to non-immediate version
18966 default: llvm_unreachable("Unknown target vector shift node");
18967 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18968 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18969 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18970 }
18972 // Need to build a vector containing shift amount.
18973 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18974 // +=================+============+=======================================+
18975 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18976 // +=================+============+=======================================+
18977 // | i64 | Yes, No | Use ShAmt as lowest elt |
18978 // | i32 | Yes | zero-extend in-reg |
18979 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18980 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
18981 // +=================+============+=======================================+
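// The packed shift instructions (PSLLW/PSLLD/PSLLQ and friends) only read
// the low 64 bits of the XMM count operand, which is why an i64 amount can
// be placed directly in the lowest element.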
18983 if (SVT == MVT::i64)
18984 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18985 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18986 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18987 ShAmt = ShAmt.getOperand(0);
18988 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
18989 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18990 } else if (Subtarget.hasSSE41() &&
18991 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18992 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18993 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18994 } else {
18995 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18996 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18997 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
18998 }
19000 // The return type has to be a 128-bit type with the same element
19001 // type as the input type.
19002 MVT EltVT = VT.getVectorElementType();
19003 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19005 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19006 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19009 /// \brief Return Mask with the necessary casting or extending
19010 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19011 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19012 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19013 const SDLoc &dl) {
19015 if (isAllOnesConstant(Mask))
19016 return DAG.getTargetConstant(1, dl, MaskVT);
19017 if (X86::isZeroNode(Mask))
19018 return DAG.getTargetConstant(0, dl, MaskVT);
19020 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19021 // Mask should be extended
19022 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19023 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19024 }
19026 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19027 if (MaskVT == MVT::v64i1) {
19028 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19029 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19030 SDValue Lo, Hi;
19031 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19032 DAG.getConstant(0, dl, MVT::i32));
19033 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19034 DAG.getConstant(1, dl, MVT::i32));
19036 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19037 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19039 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19041 // MaskVT requires < 64 bits. Truncate the mask (this should succeed in any
19042 // case) and bitcast it to the required vector type.
19043 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19044 return DAG.getBitcast(MaskVT,
19045 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19046 }
19047 }
19049 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19050 Mask.getSimpleValueType().getSizeInBits());
19051 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19052 // are extracted by EXTRACT_SUBVECTOR.
19053 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19054 DAG.getBitcast(BitcastVT, Mask),
19055 DAG.getIntPtrConstant(0, dl));
19059 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19060 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19061 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19062 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19063 SDValue PreservedSrc,
19064 const X86Subtarget &Subtarget,
19065 SelectionDAG &DAG) {
19066 MVT VT = Op.getSimpleValueType();
19067 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19068 unsigned OpcodeSelect = ISD::VSELECT;
19069 SDLoc dl(Op);
19071 if (isAllOnesConstant(Mask))
19072 return Op;
19074 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19076 switch (Op.getOpcode()) {
19077 default: break;
19078 case X86ISD::PCMPEQM:
19079 case X86ISD::PCMPGTM:
19080 case X86ISD::CMPM:
19081 case X86ISD::CMPMU:
19082 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19083 case X86ISD::VFPCLASS:
19084 case X86ISD::VFPCLASSS:
19085 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19086 case X86ISD::VTRUNC:
19087 case X86ISD::VTRUNCS:
19088 case X86ISD::VTRUNCUS:
19089 case X86ISD::CVTPS2PH:
19090 // We can't use ISD::VSELECT here because it is not always "Legal"
19091 // for the destination type. For example, vpmovqb requires only AVX512F,
19092 // but a vselect that operates on byte elements requires AVX512BW.
19093 OpcodeSelect = X86ISD::SELECT;
19094 break;
19095 }
19096 if (PreservedSrc.isUndef())
19097 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19098 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19101 /// \brief Creates an SDNode for a predicated scalar operation.
19102 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19103 /// The mask is coming as MVT::i8 and it should be truncated
19104 /// to MVT::i1 while lowering masking intrinsics.
19105 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19106 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19107 /// for a scalar instruction.
19108 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19109 SDValue PreservedSrc,
19110 const X86Subtarget &Subtarget,
19111 SelectionDAG &DAG) {
19112 if (isAllOnesConstant(Mask))
19113 return Op;
19115 MVT VT = Op.getSimpleValueType();
19116 SDLoc dl(Op);
19117 // The mask should be of type MVT::i1
19118 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
19120 if (Op.getOpcode() == X86ISD::FSETCCM ||
19121 Op.getOpcode() == X86ISD::FSETCCM_RND)
19122 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19123 if (Op.getOpcode() == X86ISD::VFPCLASS ||
19124 Op.getOpcode() == X86ISD::VFPCLASSS)
19125 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19127 if (PreservedSrc.isUndef())
19128 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19129 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19132 static int getSEHRegistrationNodeSize(const Function *Fn) {
19133 if (!Fn->hasPersonalityFn())
19134 report_fatal_error(
19135 "querying registration node size for function without personality");
19136 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19137 // WinEHStatePass for the full struct definition.
19138 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19139 case EHPersonality::MSVC_X86SEH: return 24;
19140 case EHPersonality::MSVC_CXX: return 16;
19141 default: break;
19142 }
19143 report_fatal_error(
19144 "can only recover FP for 32-bit MSVC EH personality functions");
19147 /// When the MSVC runtime transfers control to us, either to an outlined
19148 /// function or when returning to a parent frame after catching an exception, we
19149 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19150 /// Here's the math:
19151 /// RegNodeBase = EntryEBP - RegNodeSize
19152 /// ParentFP = RegNodeBase - ParentFrameOffset
19153 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19154 /// subtracting the offset (negative on x86) takes us back to the parent FP.
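/// For example, with the 32-bit MSVC SEH personality the registration node is
/// 24 bytes (see getSEHRegistrationNodeSize), so
/// ParentFP = (EntryEBP - 24) - ParentFrameOffset.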
19155 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19156 SDValue EntryEBP) {
19157 MachineFunction &MF = DAG.getMachineFunction();
19158 SDLoc dl;
19160 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19161 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19163 // It's possible that the parent function no longer has a personality function
19164 // if the exceptional code was optimized away, in which case we just return
19165 // the incoming EBP.
19166 if (!Fn->hasPersonalityFn())
19167 return EntryEBP;
19169 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19170 // registration, or the .set_setframe offset.
19171 MCSymbol *OffsetSym =
19172 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19173 GlobalValue::getRealLinkageName(Fn->getName()));
19174 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19175 SDValue ParentFrameOffset =
19176 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19178 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19179 // prologue to RBP in the parent function.
19180 const X86Subtarget &Subtarget =
19181 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19182 if (Subtarget.is64Bit())
19183 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19185 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19186 // RegNodeBase = EntryEBP - RegNodeSize
19187 // ParentFP = RegNodeBase - ParentFrameOffset
19188 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19189 DAG.getConstant(RegNodeSize, dl, PtrVT));
19190 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19193 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19194 SelectionDAG &DAG) {
19195 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19196 auto isRoundModeCurDirection = [](SDValue Rnd) {
19197 if (!isa<ConstantSDNode>(Rnd))
19198 return false;
19200 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19201 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19202 };
19204 SDLoc dl(Op);
19205 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19206 MVT VT = Op.getSimpleValueType();
19207 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19208 if (IntrData) {
19209 switch(IntrData->Type) {
19210 case INTR_TYPE_1OP:
19211 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19212 case INTR_TYPE_2OP:
19213 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19215 case INTR_TYPE_3OP:
19216 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19217 Op.getOperand(2), Op.getOperand(3));
19218 case INTR_TYPE_4OP:
19219 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19220 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19221 case INTR_TYPE_1OP_MASK_RM: {
19222 SDValue Src = Op.getOperand(1);
19223 SDValue PassThru = Op.getOperand(2);
19224 SDValue Mask = Op.getOperand(3);
19225 SDValue RoundingMode;
19226 // We always add rounding mode to the Node.
19227 // If the rounding mode is not specified, we add the
19228 // "current direction" mode.
19229 if (Op.getNumOperands() == 4)
19230 RoundingMode =
19231 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19232 else
19233 RoundingMode = Op.getOperand(4);
19234 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19235 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19237 Mask, PassThru, Subtarget, DAG);
19239 case INTR_TYPE_1OP_MASK: {
19240 SDValue Src = Op.getOperand(1);
19241 SDValue PassThru = Op.getOperand(2);
19242 SDValue Mask = Op.getOperand(3);
19243 // We add rounding mode to the Node when
19244 // - RM Opcode is specified and
19245 // - RM is not "current direction".
19246 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19247 if (IntrWithRoundingModeOpcode != 0) {
19248 SDValue Rnd = Op.getOperand(4);
19249 if (!isRoundModeCurDirection(Rnd)) {
19250 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19251 dl, Op.getValueType(),
19253 Mask, PassThru, Subtarget, DAG);
19256 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19257 Mask, PassThru, Subtarget, DAG);
19259 case INTR_TYPE_SCALAR_MASK: {
19260 SDValue Src1 = Op.getOperand(1);
19261 SDValue Src2 = Op.getOperand(2);
19262 SDValue passThru = Op.getOperand(3);
19263 SDValue Mask = Op.getOperand(4);
19264 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19265 if (IntrWithRoundingModeOpcode != 0) {
19266 SDValue Rnd = Op.getOperand(5);
19267 if (!isRoundModeCurDirection(Rnd))
19268 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19269 dl, VT, Src1, Src2, Rnd),
19270 Mask, passThru, Subtarget, DAG);
19272 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19273 Mask, passThru, Subtarget, DAG);
19275 case INTR_TYPE_SCALAR_MASK_RM: {
19276 SDValue Src1 = Op.getOperand(1);
19277 SDValue Src2 = Op.getOperand(2);
19278 SDValue Src0 = Op.getOperand(3);
19279 SDValue Mask = Op.getOperand(4);
19280 // There are 2 kinds of intrinsics in this group:
19281 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19282 // (2) With rounding mode and sae - 7 operands.
19283 if (Op.getNumOperands() == 6) {
19284 SDValue Sae = Op.getOperand(5);
19285 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19287 Mask, Src0, Subtarget, DAG);
19289 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19290 SDValue RoundingMode = Op.getOperand(5);
19291 SDValue Sae = Op.getOperand(6);
19292 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19293 RoundingMode, Sae),
19294 Mask, Src0, Subtarget, DAG);
19296 case INTR_TYPE_2OP_MASK:
19297 case INTR_TYPE_2OP_IMM8_MASK: {
19298 SDValue Src1 = Op.getOperand(1);
19299 SDValue Src2 = Op.getOperand(2);
19300 SDValue PassThru = Op.getOperand(3);
19301 SDValue Mask = Op.getOperand(4);
19303 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19304 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19306 // We specify 2 possible opcodes for intrinsics with rounding modes.
19307 // First, we check if the intrinsic may have non-default rounding mode,
19308 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19309 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19310 if (IntrWithRoundingModeOpcode != 0) {
19311 SDValue Rnd = Op.getOperand(5);
19312 if (!isRoundModeCurDirection(Rnd)) {
19313 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19314 dl, Op.getValueType(),
19316 Mask, PassThru, Subtarget, DAG);
19319 // TODO: Intrinsics should have fast-math-flags to propagate.
19320 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19321 Mask, PassThru, Subtarget, DAG);
19323 case INTR_TYPE_2OP_MASK_RM: {
19324 SDValue Src1 = Op.getOperand(1);
19325 SDValue Src2 = Op.getOperand(2);
19326 SDValue PassThru = Op.getOperand(3);
19327 SDValue Mask = Op.getOperand(4);
19328 // We specify 2 possible modes for these intrinsics: with or without a
19329 // rounding mode.
19330 // First, we check whether the intrinsic has a rounding mode (6 operands);
19331 // if not, we set the rounding mode to "current direction".
19332 SDValue Rnd;
19333 if (Op.getNumOperands() == 6)
19334 Rnd = Op.getOperand(5);
19335 else
19336 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19337 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19339 Mask, PassThru, Subtarget, DAG);
19341 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19342 SDValue Src1 = Op.getOperand(1);
19343 SDValue Src2 = Op.getOperand(2);
19344 SDValue Src3 = Op.getOperand(3);
19345 SDValue PassThru = Op.getOperand(4);
19346 SDValue Mask = Op.getOperand(5);
19347 SDValue Sae = Op.getOperand(6);
19349 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19351 Mask, PassThru, Subtarget, DAG);
19353 case INTR_TYPE_3OP_MASK_RM: {
19354 SDValue Src1 = Op.getOperand(1);
19355 SDValue Src2 = Op.getOperand(2);
19356 SDValue Imm = Op.getOperand(3);
19357 SDValue PassThru = Op.getOperand(4);
19358 SDValue Mask = Op.getOperand(5);
19359 // We specify 2 possible modes for these intrinsics: with or without a
19360 // rounding mode.
19361 // First, we check whether the intrinsic has a rounding mode (7 operands);
19362 // if not, we set the rounding mode to "current direction".
19363 SDValue Rnd;
19364 if (Op.getNumOperands() == 7)
19365 Rnd = Op.getOperand(6);
19366 else
19367 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19368 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19369 Src1, Src2, Imm, Rnd),
19370 Mask, PassThru, Subtarget, DAG);
19372 case INTR_TYPE_3OP_IMM8_MASK:
19373 case INTR_TYPE_3OP_MASK: {
19374 SDValue Src1 = Op.getOperand(1);
19375 SDValue Src2 = Op.getOperand(2);
19376 SDValue Src3 = Op.getOperand(3);
19377 SDValue PassThru = Op.getOperand(4);
19378 SDValue Mask = Op.getOperand(5);
19380 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19381 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19383 // We specify 2 possible opcodes for intrinsics with rounding modes.
19384 // First, we check if the intrinsic may have non-default rounding mode,
19385 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19386 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19387 if (IntrWithRoundingModeOpcode != 0) {
19388 SDValue Rnd = Op.getOperand(6);
19389 if (!isRoundModeCurDirection(Rnd)) {
19390 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19391 dl, Op.getValueType(),
19392 Src1, Src2, Src3, Rnd),
19393 Mask, PassThru, Subtarget, DAG);
19396 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19398 Mask, PassThru, Subtarget, DAG);
19400 case VPERM_2OP_MASK : {
19401 SDValue Src1 = Op.getOperand(1);
19402 SDValue Src2 = Op.getOperand(2);
19403 SDValue PassThru = Op.getOperand(3);
19404 SDValue Mask = Op.getOperand(4);
19406 // Swap Src1 and Src2 in the node creation
19407 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19408 Mask, PassThru, Subtarget, DAG);
19410 case VPERM_3OP_MASKZ:
19411 case VPERM_3OP_MASK:{
19412 MVT VT = Op.getSimpleValueType();
19413 // Src2 is the PassThru
19414 SDValue Src1 = Op.getOperand(1);
19415 // PassThru needs to be the same type as the destination in order
19416 // to pattern match correctly.
19417 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19418 SDValue Src3 = Op.getOperand(3);
19419 SDValue Mask = Op.getOperand(4);
19420 SDValue PassThru = SDValue();
19422 // set PassThru element
19423 if (IntrData->Type == VPERM_3OP_MASKZ)
19424 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19428 // Swap Src1 and Src2 in the node creation
19429 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19430 dl, Op.getValueType(),
19432 Mask, PassThru, Subtarget, DAG);
19436 case FMA_OP_MASK: {
19437 SDValue Src1 = Op.getOperand(1);
19438 SDValue Src2 = Op.getOperand(2);
19439 SDValue Src3 = Op.getOperand(3);
19440 SDValue Mask = Op.getOperand(4);
19441 MVT VT = Op.getSimpleValueType();
19442 SDValue PassThru = SDValue();
19444 // set PassThru element
19445 if (IntrData->Type == FMA_OP_MASKZ)
19446 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19447 else if (IntrData->Type == FMA_OP_MASK3)
19452 // We specify 2 possible opcodes for intrinsics with rounding modes.
19453 // First, we check if the intrinsic may have non-default rounding mode,
19454 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19455 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19456 if (IntrWithRoundingModeOpcode != 0) {
19457 SDValue Rnd = Op.getOperand(5);
19458 if (!isRoundModeCurDirection(Rnd))
19459 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19460 dl, Op.getValueType(),
19461 Src1, Src2, Src3, Rnd),
19462 Mask, PassThru, Subtarget, DAG);
19464 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19465 dl, Op.getValueType(),
19467 Mask, PassThru, Subtarget, DAG);
19469 case FMA_OP_SCALAR_MASK:
19470 case FMA_OP_SCALAR_MASK3:
19471 case FMA_OP_SCALAR_MASKZ: {
19472 SDValue Src1 = Op.getOperand(1);
19473 SDValue Src2 = Op.getOperand(2);
19474 SDValue Src3 = Op.getOperand(3);
19475 SDValue Mask = Op.getOperand(4);
19476 MVT VT = Op.getSimpleValueType();
19477 SDValue PassThru = SDValue();
19479 // set PassThru element
19480 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19481 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19482 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19487 SDValue Rnd = Op.getOperand(5);
19488 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19489 Op.getValueType(), Src1, Src2,
19491 Mask, PassThru, Subtarget, DAG);
19493 case TERLOG_OP_MASK:
19494 case TERLOG_OP_MASKZ: {
19495 SDValue Src1 = Op.getOperand(1);
19496 SDValue Src2 = Op.getOperand(2);
19497 SDValue Src3 = Op.getOperand(3);
19498 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19499 SDValue Mask = Op.getOperand(5);
19500 MVT VT = Op.getSimpleValueType();
19501 SDValue PassThru = Src1;
19502 // Set PassThru element.
19503 if (IntrData->Type == TERLOG_OP_MASKZ)
19504 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19506 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19507 Src1, Src2, Src3, Src4),
19508 Mask, PassThru, Subtarget, DAG);
19511 // ISD::FP_ROUND has a second argument that indicates if the truncation
19512 // does not change the value. Set it to 0 since it can change.
19513 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19514 DAG.getIntPtrConstant(0, dl));
19515 case CVTPD2PS_MASK: {
19516 SDValue Src = Op.getOperand(1);
19517 SDValue PassThru = Op.getOperand(2);
19518 SDValue Mask = Op.getOperand(3);
19519 // We add rounding mode to the Node when
19520 // - RM Opcode is specified and
19521 // - RM is not "current direction".
19522 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19523 if (IntrWithRoundingModeOpcode != 0) {
19524 SDValue Rnd = Op.getOperand(4);
19525 if (!isRoundModeCurDirection(Rnd)) {
19526 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19527 dl, Op.getValueType(),
19529 Mask, PassThru, Subtarget, DAG);
19532 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19533 // ISD::FP_ROUND has a second argument that indicates if the truncation
19534 // does not change the value. Set it to 0 since it can change.
19535 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19536 DAG.getIntPtrConstant(0, dl)),
19537 Mask, PassThru, Subtarget, DAG);
19540 // FPclass intrinsics with mask
19541 SDValue Src1 = Op.getOperand(1);
19542 MVT VT = Src1.getSimpleValueType();
19543 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19544 SDValue Imm = Op.getOperand(2);
19545 SDValue Mask = Op.getOperand(3);
19546 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19547 Mask.getSimpleValueType().getSizeInBits());
19548 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19549 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19550 DAG.getTargetConstant(0, dl, MaskVT),
19552 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19553 DAG.getUNDEF(BitcastVT), FPclassMask,
19554 DAG.getIntPtrConstant(0, dl));
19555 return DAG.getBitcast(Op.getValueType(), Res);
19558 SDValue Src1 = Op.getOperand(1);
19559 SDValue Imm = Op.getOperand(2);
19560 SDValue Mask = Op.getOperand(3);
19561 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
19562 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19563 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19564 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
19567 case CMP_MASK_CC: {
19568 // Comparison intrinsics with masks.
19569 // Example of transformation:
19570 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19571 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19573 // (v8i1 (insert_subvector undef,
19574 // (v2i1 (and (PCMPEQM %a, %b),
19575 // (extract_subvector
19576 // (v8i1 (bitcast %mask)), 0))), 0))))
19577 MVT VT = Op.getOperand(1).getSimpleValueType();
19578 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19579 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19580 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19581 Mask.getSimpleValueType().getSizeInBits());
19582 SDValue Cmp;
19583 if (IntrData->Type == CMP_MASK_CC) {
19584 SDValue CC = Op.getOperand(3);
19585 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19586 // We specify 2 possible opcodes for intrinsics with rounding modes.
19587 // First, we check if the intrinsic may have non-default rounding mode,
19588 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19589 if (IntrData->Opc1 != 0) {
19590 SDValue Rnd = Op.getOperand(5);
19591 if (!isRoundModeCurDirection(Rnd))
19592 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19593 Op.getOperand(2), CC, Rnd);
19594 else
19595 // Default rounding mode.
19597 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19598 Op.getOperand(2), CC);
19601 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19602 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19605 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19606 DAG.getTargetConstant(0, dl,
19609 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19610 DAG.getUNDEF(BitcastVT), CmpMask,
19611 DAG.getIntPtrConstant(0, dl));
19612 return DAG.getBitcast(Op.getValueType(), Res);
19614 case CMP_MASK_SCALAR_CC: {
19615 SDValue Src1 = Op.getOperand(1);
19616 SDValue Src2 = Op.getOperand(2);
19617 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19618 SDValue Mask = Op.getOperand(4);
19620 SDValue Cmp;
19621 if (IntrData->Opc1 != 0) {
19622 SDValue Rnd = Op.getOperand(5);
19623 if (!isRoundModeCurDirection(Rnd))
19624 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
19625 else
19626 // Default rounding mode.
19628 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
19630 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19631 DAG.getTargetConstant(0, dl,
19635 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
19637 case COMI: { // Comparison intrinsics
19638 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19639 SDValue LHS = Op.getOperand(1);
19640 SDValue RHS = Op.getOperand(2);
19641 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19642 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19645 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19646 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19647 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19648 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19651 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19652 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19653 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19654 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19657 case ISD::SETGT: // (CF = 0 and ZF = 0)
19658 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19660 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19661 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19664 case ISD::SETGE: // CF = 0
19665 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19667 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19668 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19671 llvm_unreachable("Unexpected illegal condition!");
19673 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19675 case COMI_RM: { // Comparison intrinsics with Sae
19676 SDValue LHS = Op.getOperand(1);
19677 SDValue RHS = Op.getOperand(2);
19678 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19679 SDValue Sae = Op.getOperand(4);
19682 if (isRoundModeCurDirection(Sae))
19683 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19684 DAG.getConstant(CondVal, dl, MVT::i8));
19686 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19687 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19688 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19689 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19692 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19693 Op.getOperand(1), Op.getOperand(2), Subtarget,
19695 case COMPRESS_EXPAND_IN_REG: {
19696 SDValue Mask = Op.getOperand(3);
19697 SDValue DataToCompress = Op.getOperand(1);
19698 SDValue PassThru = Op.getOperand(2);
19699 if (isAllOnesConstant(Mask)) // return data as is
19700 return Op.getOperand(1);
19702 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19704 Mask, PassThru, Subtarget, DAG);
19707 SDValue Mask = Op.getOperand(1);
19708 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19709 Mask.getSimpleValueType().getSizeInBits());
19710 Mask = DAG.getBitcast(MaskVT, Mask);
19711 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19714 MVT VT = Op.getSimpleValueType();
19715 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19717 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19718 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19719 // Arguments should be swapped.
19720 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19721 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19723 return DAG.getBitcast(VT, Res);
19726 MVT VT = Op.getSimpleValueType();
19727 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19729 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19730 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19731 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19732 return DAG.getBitcast(VT, Res);
19735 case FIXUPIMMS_MASKZ:
19737 case FIXUPIMM_MASKZ:{
19738 SDValue Src1 = Op.getOperand(1);
19739 SDValue Src2 = Op.getOperand(2);
19740 SDValue Src3 = Op.getOperand(3);
19741 SDValue Imm = Op.getOperand(4);
19742 SDValue Mask = Op.getOperand(5);
19743 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19744 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19745 // We specify 2 possible modes for these intrinsics: with or without a
19746 // rounding mode.
19747 // First, we check whether the intrinsic has a rounding mode (7 operands);
19748 // if not, we set the rounding mode to "current direction".
19749 SDValue Rnd;
19750 if (Op.getNumOperands() == 7)
19751 Rnd = Op.getOperand(6);
19752 else
19753 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19754 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19755 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19756 Src1, Src2, Src3, Imm, Rnd),
19757 Mask, Passthru, Subtarget, DAG);
19758 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19759 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19760 Src1, Src2, Src3, Imm, Rnd),
19761 Mask, Passthru, Subtarget, DAG);
19763 case CONVERT_TO_MASK: {
19764 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19765 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19766 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19768 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19770 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19771 DAG.getUNDEF(BitcastVT), CvtMask,
19772 DAG.getIntPtrConstant(0, dl));
19773 return DAG.getBitcast(Op.getValueType(), Res);
19775 case CONVERT_MASK_TO_VEC: {
19776 SDValue Mask = Op.getOperand(1);
19777 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19778 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19779 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19781 case BRCST_SUBVEC_TO_VEC: {
19782 SDValue Src = Op.getOperand(1);
19783 SDValue Passthru = Op.getOperand(2);
19784 SDValue Mask = Op.getOperand(3);
19785 EVT resVT = Passthru.getValueType();
19786 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19787 DAG.getUNDEF(resVT), Src,
19788 DAG.getIntPtrConstant(0, dl));
19790 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19791 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19793 immVal = DAG.getConstant(0, dl, MVT::i8);
19794 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19795 subVec, subVec, immVal),
19796 Mask, Passthru, Subtarget, DAG);
19798 case BRCST32x2_TO_VEC: {
19799 SDValue Src = Op.getOperand(1);
19800 SDValue PassThru = Op.getOperand(2);
19801 SDValue Mask = Op.getOperand(3);
19803 assert((VT.getScalarType() == MVT::i32 ||
19804 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19805 //bitcast Src to packed 64
19806 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19807 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19808 Src = DAG.getBitcast(BitcastVT, Src);
19810 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19811 Mask, PassThru, Subtarget, DAG);
19819 default: return SDValue(); // Don't custom lower most intrinsics.
19821 case Intrinsic::x86_avx2_permd:
19822 case Intrinsic::x86_avx2_permps:
19823 // Operands intentionally swapped. Mask is last operand to intrinsic,
19824 // but second operand for node/instruction.
19825 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19826 Op.getOperand(2), Op.getOperand(1));
19828 // ptest and testp intrinsics. The intrinsic these come from are designed to
19829 // return an integer value, not just an instruction so lower it to the ptest
19830 // or testp pattern and a setcc for the result.
19831 case Intrinsic::x86_sse41_ptestz:
19832 case Intrinsic::x86_sse41_ptestc:
19833 case Intrinsic::x86_sse41_ptestnzc:
19834 case Intrinsic::x86_avx_ptestz_256:
19835 case Intrinsic::x86_avx_ptestc_256:
19836 case Intrinsic::x86_avx_ptestnzc_256:
19837 case Intrinsic::x86_avx_vtestz_ps:
19838 case Intrinsic::x86_avx_vtestc_ps:
19839 case Intrinsic::x86_avx_vtestnzc_ps:
19840 case Intrinsic::x86_avx_vtestz_pd:
19841 case Intrinsic::x86_avx_vtestc_pd:
19842 case Intrinsic::x86_avx_vtestnzc_pd:
19843 case Intrinsic::x86_avx_vtestz_ps_256:
19844 case Intrinsic::x86_avx_vtestc_ps_256:
19845 case Intrinsic::x86_avx_vtestnzc_ps_256:
19846 case Intrinsic::x86_avx_vtestz_pd_256:
19847 case Intrinsic::x86_avx_vtestc_pd_256:
19848 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19849 bool IsTestPacked = false;
19850 X86::CondCode X86CC;
19852 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19853 case Intrinsic::x86_avx_vtestz_ps:
19854 case Intrinsic::x86_avx_vtestz_pd:
19855 case Intrinsic::x86_avx_vtestz_ps_256:
19856 case Intrinsic::x86_avx_vtestz_pd_256:
19857 IsTestPacked = true;
19859 case Intrinsic::x86_sse41_ptestz:
19860 case Intrinsic::x86_avx_ptestz_256:
19862 X86CC = X86::COND_E;
19864 case Intrinsic::x86_avx_vtestc_ps:
19865 case Intrinsic::x86_avx_vtestc_pd:
19866 case Intrinsic::x86_avx_vtestc_ps_256:
19867 case Intrinsic::x86_avx_vtestc_pd_256:
19868 IsTestPacked = true;
19870 case Intrinsic::x86_sse41_ptestc:
19871 case Intrinsic::x86_avx_ptestc_256:
19873 X86CC = X86::COND_B;
19875 case Intrinsic::x86_avx_vtestnzc_ps:
19876 case Intrinsic::x86_avx_vtestnzc_pd:
19877 case Intrinsic::x86_avx_vtestnzc_ps_256:
19878 case Intrinsic::x86_avx_vtestnzc_pd_256:
19879 IsTestPacked = true;
19881 case Intrinsic::x86_sse41_ptestnzc:
19882 case Intrinsic::x86_avx_ptestnzc_256:
19884 X86CC = X86::COND_A;
19888 SDValue LHS = Op.getOperand(1);
19889 SDValue RHS = Op.getOperand(2);
19890 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19891 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19892 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19893 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19895 case Intrinsic::x86_avx512_kortestz_w:
19896 case Intrinsic::x86_avx512_kortestc_w: {
19897 X86::CondCode X86CC =
19898 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19899 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19900 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19901 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19902 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19903 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19906 case Intrinsic::x86_avx512_knot_w: {
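// A constant 1 of type v16i1 splats the i1 value 1 into every lane, i.e. an
// all-ones mask, so the XOR below implements a bitwise NOT of the k-register.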
19907 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19908 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
19909 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19910 return DAG.getBitcast(MVT::i16, Res);
19913 case Intrinsic::x86_avx512_kandn_w: {
19914 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19915 // Invert LHS for the not.
19916 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
19917 DAG.getConstant(1, dl, MVT::v16i1));
19918 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19919 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
19920 return DAG.getBitcast(MVT::i16, Res);
19923 case Intrinsic::x86_avx512_kxnor_w: {
19924 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19925 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19926 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
19927 // Invert result for the not.
19928 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
19929 DAG.getConstant(1, dl, MVT::v16i1));
19930 return DAG.getBitcast(MVT::i16, Res);
19933 case Intrinsic::x86_sse42_pcmpistria128:
19934 case Intrinsic::x86_sse42_pcmpestria128:
19935 case Intrinsic::x86_sse42_pcmpistric128:
19936 case Intrinsic::x86_sse42_pcmpestric128:
19937 case Intrinsic::x86_sse42_pcmpistrio128:
19938 case Intrinsic::x86_sse42_pcmpestrio128:
19939 case Intrinsic::x86_sse42_pcmpistris128:
19940 case Intrinsic::x86_sse42_pcmpestris128:
19941 case Intrinsic::x86_sse42_pcmpistriz128:
19942 case Intrinsic::x86_sse42_pcmpestriz128: {
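// Each of these SSE4.2 string-compare intrinsics returns a single EFLAGS bit
// (above/carry/overflow/sign/zero) rather than the index, so map each one to
// the PCMPISTRI/PCMPESTRI node plus the matching condition code.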
unsigned Opcode;
19944 X86::CondCode X86CC;
switch (IntNo) {
19946 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19947 case Intrinsic::x86_sse42_pcmpistria128:
19948 Opcode = X86ISD::PCMPISTRI;
19949 X86CC = X86::COND_A;
break;
19951 case Intrinsic::x86_sse42_pcmpestria128:
19952 Opcode = X86ISD::PCMPESTRI;
19953 X86CC = X86::COND_A;
break;
19955 case Intrinsic::x86_sse42_pcmpistric128:
19956 Opcode = X86ISD::PCMPISTRI;
19957 X86CC = X86::COND_B;
break;
19959 case Intrinsic::x86_sse42_pcmpestric128:
19960 Opcode = X86ISD::PCMPESTRI;
19961 X86CC = X86::COND_B;
break;
19963 case Intrinsic::x86_sse42_pcmpistrio128:
19964 Opcode = X86ISD::PCMPISTRI;
19965 X86CC = X86::COND_O;
break;
19967 case Intrinsic::x86_sse42_pcmpestrio128:
19968 Opcode = X86ISD::PCMPESTRI;
19969 X86CC = X86::COND_O;
break;
19971 case Intrinsic::x86_sse42_pcmpistris128:
19972 Opcode = X86ISD::PCMPISTRI;
19973 X86CC = X86::COND_S;
break;
19975 case Intrinsic::x86_sse42_pcmpestris128:
19976 Opcode = X86ISD::PCMPESTRI;
19977 X86CC = X86::COND_S;
break;
19979 case Intrinsic::x86_sse42_pcmpistriz128:
19980 Opcode = X86ISD::PCMPISTRI;
19981 X86CC = X86::COND_E;
break;
19983 case Intrinsic::x86_sse42_pcmpestriz128:
19984 Opcode = X86ISD::PCMPESTRI;
19985 X86CC = X86::COND_E;
break;
}
19988 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19989 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19990 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19991 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19992 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19995 case Intrinsic::x86_sse42_pcmpistri128:
19996 case Intrinsic::x86_sse42_pcmpestri128: {
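// The plain index-returning forms just emit the PCMPxSTRI node: result 0 is
// the index the instruction leaves in ECX, result 1 is EFLAGS.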
19998 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19999 Opcode = X86ISD::PCMPISTRI;
20001 Opcode = X86ISD::PCMPESTRI;
20003 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20004 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20005 return DAG.getNode(Opcode, dl, VTs, NewOps);
20008 case Intrinsic::eh_sjlj_lsda: {
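// SjLj exception handling: the LSDA for this function is emitted under the
// 'GCC_except_table<function number>' symbol, so materialize that address.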
20009 MachineFunction &MF = DAG.getMachineFunction();
20010 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20011 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20012 auto &Context = MF.getMMI().getContext();
20013 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20014 Twine(MF.getFunctionNumber()));
20015 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20018 case Intrinsic::x86_seh_lsda: {
20019 // Compute the symbol for the LSDA. We know it'll get emitted later.
20020 MachineFunction &MF = DAG.getMachineFunction();
20021 SDValue Op1 = Op.getOperand(1);
20022 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20023 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20024 GlobalValue::getRealLinkageName(Fn->getName()));
20026 // Generate a simple absolute symbol reference. This intrinsic is only
20027 // supported on 32-bit Windows, which isn't PIC.
20028 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20029 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20032 case Intrinsic::x86_seh_recoverfp: {
20033 SDValue FnOp = Op.getOperand(1);
20034 SDValue IncomingFPOp = Op.getOperand(2);
20035 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20036 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20038 report_fatal_error(
20039 "llvm.x86.seh.recoverfp must take a function as the first argument");
20040 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20043 case Intrinsic::localaddress: {
20044 // Returns one of the stack, base, or frame pointer registers, depending on
20045 // which is used to reference local variables.
20046 MachineFunction &MF = DAG.getMachineFunction();
20047 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20049 if (RegInfo->hasBasePointer(MF))
20050 Reg = RegInfo->getBaseRegister();
20051 else // This function handles the SP or FP case.
20052 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20053 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
}
}
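/// Build the machine node for an AVX2 gather intrinsic. When the source
/// (pass-through) value is unused, a zero vector is substituted so the gather
/// does not carry a false register dependency on it.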
20058 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20059 SDValue Src, SDValue Mask, SDValue Base,
20060 SDValue Index, SDValue ScaleOp, SDValue Chain,
20061 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
20063 auto *C = cast<ConstantSDNode>(ScaleOp);
20064 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20065 EVT MaskVT = Mask.getValueType();
20066 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20067 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20068 SDValue Segment = DAG.getRegister(0, MVT::i32);
20069 // If source is undef or we know it won't be used, use a zero vector
20070 // to break register dependency.
20071 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20072 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20073 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20074 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20075 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20076 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20077 return DAG.getMergeValues(RetOps, dl);
20080 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20081 SDValue Src, SDValue Mask, SDValue Base,
20082 SDValue Index, SDValue ScaleOp, SDValue Chain,
20083 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
20085 auto *C = cast<ConstantSDNode>(ScaleOp);
20086 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20087 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20088 Index.getSimpleValueType().getVectorNumElements());
20090 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20091 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20092 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20093 SDValue Segment = DAG.getRegister(0, MVT::i32);
20094 // If source is undef or we know it won't be used, use a zero vector
20095 // to break register dependency.
20096 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20097 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20098 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20099 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20100 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20101 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20102 return DAG.getMergeValues(RetOps, dl);
20105 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20106 SDValue Src, SDValue Mask, SDValue Base,
20107 SDValue Index, SDValue ScaleOp, SDValue Chain,
20108 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
20110 auto *C = cast<ConstantSDNode>(ScaleOp);
20111 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20112 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20113 SDValue Segment = DAG.getRegister(0, MVT::i32);
20114 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20115 Index.getSimpleValueType().getVectorNumElements());
20117 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20118 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20119 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20120 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20121 return SDValue(Res, 1);
20124 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20125 SDValue Mask, SDValue Base, SDValue Index,
20126 SDValue ScaleOp, SDValue Chain,
20127 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
20129 auto *C = cast<ConstantSDNode>(ScaleOp);
20130 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20131 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20132 SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
20134 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20135 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20136 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20137 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20138 return SDValue(Res, 0);
20141 /// Handles the lowering of builtin intrinsic that return the value
20142 /// of the extended control register.
20143 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
20145 const X86Subtarget &Subtarget,
20146 SmallVectorImpl<SDValue> &Results) {
20147 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20148 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
20151 // The ECX register is used to select the index of the XCR register to
// return.
SDValue Chain =
20154 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20155 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20156 Chain = SDValue(N1, 0);
20158 // Reads the content of XCR and returns it in registers EDX:EAX.
20159 if (Subtarget.is64Bit()) {
20160 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20161 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
20164 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20165 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
20168 Chain = HI.getValue(1);
20170 if (Subtarget.is64Bit()) {
20171 // Merge the two 32-bit values into a 64-bit one..
20172 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20173 DAG.getConstant(32, DL, MVT::i8));
20174 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20175 Results.push_back(Chain);
return;
}
20179 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20180 SDValue Ops[] = { LO, HI };
20181 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20182 Results.push_back(Pair);
20183 Results.push_back(Chain);
20186 /// Handles the lowering of builtin intrinsics that read performance monitor
20187 /// counters (x86_rdpmc).
20188 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
20190 const X86Subtarget &Subtarget,
20191 SmallVectorImpl<SDValue> &Results) {
20192 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20193 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
20196 // The ECX register is used to select the index of the performance counter
// to read.
20198 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
N->getOperand(2));
20200 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20202 // Reads the content of a 64-bit performance counter and returns it in the
20203 // registers EDX:EAX.
20204 if (Subtarget.is64Bit()) {
20205 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20206 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
20209 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20210 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
20213 Chain = HI.getValue(1);
20215 if (Subtarget.is64Bit()) {
20216 // The EAX register is loaded with the low-order 32 bits. The EDX register
20217 // is loaded with the supported high-order bits of the counter.
20218 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20219 DAG.getConstant(32, DL, MVT::i8));
20220 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20221 Results.push_back(Chain);
return;
}
20225 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20226 SDValue Ops[] = { LO, HI };
20227 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20228 Results.push_back(Pair);
20229 Results.push_back(Chain);
20232 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20233 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20234 /// READCYCLECOUNTER nodes.
20235 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
20237 const X86Subtarget &Subtarget,
20238 SmallVectorImpl<SDValue> &Results) {
20239 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20240 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
20243 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20244 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20245 // and the EAX register is loaded with the low-order 32 bits.
20246 if (Subtarget.is64Bit()) {
20247 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20248 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
LO.getValue(2));
} else {
20251 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20252 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
LO.getValue(2));
}
20255 SDValue Chain = HI.getValue(1);
20257 if (Opcode == X86ISD::RDTSCP_DAG) {
20258 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20260 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20261 // the ECX register. Add 'ecx' explicitly to the chain.
20262 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
HI.getValue(2));
20264 // Explicitly store the content of ECX at the location passed in input
20265 // to the 'rdtscp' intrinsic.
20266 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20267 MachinePointerInfo());
}
20270 if (Subtarget.is64Bit()) {
20271 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20272 // the EAX register is loaded with the low-order 32 bits.
20273 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20274 DAG.getConstant(32, DL, MVT::i8));
20275 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20276 Results.push_back(Chain);
return;
}
20280 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20281 SDValue Ops[] = { LO, HI };
20282 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20283 Results.push_back(Pair);
20284 Results.push_back(Chain);
20287 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20288 SelectionDAG &DAG) {
20289 SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
20291 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
20293 return DAG.getMergeValues(Results, DL);
}
20296 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20297 MachineFunction &MF = DAG.getMachineFunction();
20298 SDValue Chain = Op.getOperand(0);
20299 SDValue RegNode = Op.getOperand(2);
20300 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
20302 report_fatal_error("EH registrations only live in functions using WinEH");
20304 // Cast the operand to an alloca, and remember the frame index.
20305 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
if (!FINode)
20307 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20308 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20310 // Return the chain operand without making any DAG nodes.
return Chain;
}
20314 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20315 MachineFunction &MF = DAG.getMachineFunction();
20316 SDValue Chain = Op.getOperand(0);
20317 SDValue EHGuard = Op.getOperand(2);
20318 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
if (!EHInfo)
20320 report_fatal_error("EHGuard only live in functions using WinEH");
20322 // Cast the operand to an alloca, and remember the frame index.
20323 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
if (!FINode)
20325 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20326 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20328 // Return the chain operand without making any DAG nodes.
return Chain;
}
20332 /// Emit Truncating Store with signed or unsigned saturation.
static SDValue
20334 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20335 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20336 SelectionDAG &DAG) {
20338 SDVTList VTs = DAG.getVTList(MVT::Other);
20339 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20340 SDValue Ops[] = { Chain, Val, Ptr, Undef };
return SignedSat ?
20342 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20343 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
20346 /// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
20348 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20349 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20350 MachineMemOperand *MMO, SelectionDAG &DAG) {
20352 SDVTList VTs = DAG.getVTList(MVT::Other);
20353 SDValue Ops[] = { Chain, Ptr, Mask, Val };
return SignedSat ?
20355 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20356 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
20359 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20360 SelectionDAG &DAG) {
20361 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20363 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
20365 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
20366 return MarkEHRegistrationNode(Op, DAG);
20367 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
20368 return MarkEHGuard(Op, DAG);
20369 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
20370 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
20371 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
20372 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
20373 // We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
20375 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20376 MFI.setHasCopyImplyingStackAdjustment(true);
20377 // Don't do anything here, we will expand these intrinsics out later
20378 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
}
return SDValue();
}

SDLoc dl(Op);
20385 switch(IntrData->Type) {
20386 default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
20389 // Emit the node with the right value type.
20390 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20391 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20393 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20394 // Otherwise return the value from Rand, which is always 0, casted to i32.
20395 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20396 DAG.getConstant(1, dl, Op->getValueType(1)),
20397 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20398 SDValue(Result.getNode(), 1) };
20399 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20400 DAG.getVTList(Op->getValueType(1), MVT::Glue),
Ops);
20403 // Return { result, isValid, chain }.
20404 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20405 SDValue(Result.getNode(), 2));
20407 case GATHER_AVX2: {
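// AVX2 gathers take the mask as an ordinary vector register rather than a
// k-register; after the intrinsic ID the operands are src, base, index,
// mask, scale.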
20408 SDValue Chain = Op.getOperand(0);
20409 SDValue Src = Op.getOperand(2);
20410 SDValue Base = Op.getOperand(3);
20411 SDValue Index = Op.getOperand(4);
20412 SDValue Mask = Op.getOperand(5);
20413 SDValue Scale = Op.getOperand(6);
20414 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20415 Scale, Chain, Subtarget);
}
case GATHER: {
20418 // gather(v1, mask, index, base, scale);
20419 SDValue Chain = Op.getOperand(0);
20420 SDValue Src = Op.getOperand(2);
20421 SDValue Base = Op.getOperand(3);
20422 SDValue Index = Op.getOperand(4);
20423 SDValue Mask = Op.getOperand(5);
20424 SDValue Scale = Op.getOperand(6);
20425 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
20429 // scatter(base, mask, index, v1, scale);
20430 SDValue Chain = Op.getOperand(0);
20431 SDValue Base = Op.getOperand(2);
20432 SDValue Mask = Op.getOperand(3);
20433 SDValue Index = Op.getOperand(4);
20434 SDValue Src = Op.getOperand(5);
20435 SDValue Scale = Op.getOperand(6);
20436 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20437 Scale, Chain, Subtarget);
}
case PREFETCH: {
20440 SDValue Hint = Op.getOperand(6);
20441 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20442 assert((HintVal == 2 || HintVal == 3) &&
20443 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20444 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20445 SDValue Chain = Op.getOperand(0);
20446 SDValue Mask = Op.getOperand(2);
20447 SDValue Index = Op.getOperand(3);
20448 SDValue Base = Op.getOperand(4);
20449 SDValue Scale = Op.getOperand(5);
20450 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
20453 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
20455 SmallVector<SDValue, 2> Results;
20456 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20458 return DAG.getMergeValues(Results, dl);
}
20460 // Read Performance Monitoring Counters.
case RDPMC: {
20462 SmallVector<SDValue, 2> Results;
20463 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20464 return DAG.getMergeValues(Results, dl);
}
20466 // Get Extended Control Register.
case XGETBV: {
20468 SmallVector<SDValue, 2> Results;
20469 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20470 return DAG.getMergeValues(Results, dl);
}
20472 // XTEST intrinsics.
case XTEST: {
20474 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20475 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20477 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20478 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20479 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20480 Ret, SDValue(InTrans.getNode(), 1));
}
// ADC/ADCX/SBB
case ADX: {
20484 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20485 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
20486 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20487 DAG.getConstant(-1, dl, MVT::i8));
20488 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20489 Op.getOperand(4), GenCF.getValue(1));
20490 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20491 Op.getOperand(5), MachinePointerInfo());
20492 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20493 SDValue Results[] = { SetCC, Store };
20494 return DAG.getMergeValues(Results, dl);
20496 case COMPRESS_TO_MEM: {
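// compress-store: an all-ones mask degenerates to a plain store; otherwise
// this becomes a masked store in compressing mode.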
20497 SDValue Mask = Op.getOperand(4);
20498 SDValue DataToCompress = Op.getOperand(3);
20499 SDValue Addr = Op.getOperand(2);
20500 SDValue Chain = Op.getOperand(0);
20501 MVT VT = DataToCompress.getSimpleValueType();
20503 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20504 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20506 if (isAllOnesConstant(Mask)) // return just a store
20507 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20508 MemIntr->getMemOperand());
20510 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20511 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20513 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20514 MemIntr->getMemOperand(),
20515 false /* truncating */, true /* compressing */);
20517 case TRUNCATE_TO_MEM_VI8:
20518 case TRUNCATE_TO_MEM_VI16:
20519 case TRUNCATE_TO_MEM_VI32: {
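// Truncating stores to memory: VTRUNC is a plain (possibly masked)
// truncating store, while VTRUNCS/VTRUNCUS saturate the value signed or
// unsigned before storing.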
20520 SDValue Mask = Op.getOperand(4);
20521 SDValue DataToTruncate = Op.getOperand(3);
20522 SDValue Addr = Op.getOperand(2);
20523 SDValue Chain = Op.getOperand(0);
20525 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20526 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20528 EVT MemVT = MemIntr->getMemoryVT();
20530 uint16_t TruncationOp = IntrData->Opc0;
20531 switch (TruncationOp) {
20532 case X86ISD::VTRUNC: {
20533 if (isAllOnesConstant(Mask)) // return just a truncate store
20534 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20535 MemIntr->getMemOperand());
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20538 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20540 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20541 MemIntr->getMemOperand(), true /* truncating */);
20543 case X86ISD::VTRUNCUS:
20544 case X86ISD::VTRUNCS: {
20545 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20546 if (isAllOnesConstant(Mask))
20547 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20548 MemIntr->getMemOperand(), DAG);
20550 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20551 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20553 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20554 VMask, MemVT, MemIntr->getMemOperand(), DAG);
}
default:
20557 llvm_unreachable("Unsupported truncstore intrinsic");
}
}
20561 case EXPAND_FROM_MEM: {
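// expand-load: an all-ones mask degenerates to a plain vector load;
// otherwise emit an expanding masked load with the given pass-through.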
20562 SDValue Mask = Op.getOperand(4);
20563 SDValue PassThru = Op.getOperand(3);
20564 SDValue Addr = Op.getOperand(2);
20565 SDValue Chain = Op.getOperand(0);
20566 MVT VT = Op.getSimpleValueType();
20568 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20569 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20571 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20572 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20573 if (X86::isZeroNode(Mask))
20574 return DAG.getUNDEF(VT);
20576 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20577 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20578 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20579 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20580 true /* expanding */);
20585 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20586 SelectionDAG &DAG) const {
20587 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20588 MFI.setReturnAddressIsTaken(true);
20590 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
20593 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
20595 EVT PtrVT = getPointerTy(DAG.getDataLayout());

if (Depth > 0) {
20598 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20599 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20600 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20601 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20602 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20603 MachinePointerInfo());
}

20606 // Just load the return address.
20607 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20608 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20609 MachinePointerInfo());
20612 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20613 SelectionDAG &DAG) const {
20614 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20615 return getReturnAddressFrameIndex(DAG);
20618 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20619 MachineFunction &MF = DAG.getMachineFunction();
20620 MachineFrameInfo &MFI = MF.getFrameInfo();
20621 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20622 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20623 EVT VT = Op.getValueType();
20625 MFI.setFrameAddressIsTaken(true);
20627 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20628 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20629 // is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
20631 int FrameAddrIndex = FuncInfo->getFAIndex();
20632 if (!FrameAddrIndex) {
20633 // Set up a frame object for the return address.
20634 unsigned SlotSize = RegInfo->getSlotSize();
20635 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20636 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20637 FuncInfo->setFAIndex(FrameAddrIndex);
20639 return DAG.getFrameIndex(FrameAddrIndex, VT);
20642 unsigned FrameReg =
20643 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20644 SDLoc dl(Op); // FIXME probably not meaningful
20645 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20646 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20647 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20648 "Invalid Frame Register!");
20649 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
20651 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20652 MachinePointerInfo());
return FrameAddr;
}
20656 // FIXME? Maybe this could be a TableGen attribute on some registers and
20657 // this table could be generated automatically from RegInfo.
20658 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20659 SelectionDAG &DAG) const {
20660 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20661 const MachineFunction &MF = DAG.getMachineFunction();
20663 unsigned Reg = StringSwitch<unsigned>(RegName)
20664 .Case("esp", X86::ESP)
20665 .Case("rsp", X86::RSP)
20666 .Case("ebp", X86::EBP)
20667 .Case("rbp", X86::RBP)
.Default(0);
20670 if (Reg == X86::EBP || Reg == X86::RBP) {
20671 if (!TFI.hasFP(MF))
20672 report_fatal_error("register " + StringRef(RegName) +
20673 " is allocatable: function has no frame pointer");
#ifndef NDEBUG
else {
20676 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20677 unsigned FrameReg =
20678 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20679 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20680 "Invalid Frame Register!");
}
#endif
}

if (Reg)
return Reg;

20688 report_fatal_error("Invalid register name global variable");
}
20691 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20692 SelectionDAG &DAG) const {
20693 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20694 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20697 unsigned X86TargetLowering::getExceptionPointerRegister(
20698 const Constant *PersonalityFn) const {
20699 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20700 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20702 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20705 unsigned X86TargetLowering::getExceptionSelectorRegister(
20706 const Constant *PersonalityFn) const {
20707 // Funclet personalities don't use selectors (the runtime does the selection).
20708 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20709 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20712 bool X86TargetLowering::needsFixedCatchObjects() const {
20713 return Subtarget.isTargetWin64();
20716 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
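// eh.return: the handler address is stored at Frame + slot size + Offset and
// that slot's address is handed to the EH_RETURN node in RCX/ECX.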
20717 SDValue Chain = Op.getOperand(0);
20718 SDValue Offset = Op.getOperand(1);
20719 SDValue Handler = Op.getOperand(2);
SDLoc dl(Op);
20722 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20723 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20724 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20725 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20726 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20727 "Invalid Frame Register!");
20728 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20729 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20731 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20732 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
20734 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20735 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20736 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20738 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20739 DAG.getRegister(StoreAddrReg, PtrVT));
20742 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20743 SelectionDAG &DAG) const {
SDLoc DL(Op);
20745 // If the subtarget is not 64-bit, we may need the global base reg
20746 // after isel expand pseudo, i.e., after CGBR pass ran.
20747 // Therefore, ask for the GlobalBaseReg now, so that the pass
20748 // inserts the code for us in case we need it.
20749 // Otherwise, we will end up in a situation where we will
20750 // reference a virtual register that is not defined!
20751 if (!Subtarget.is64Bit()) {
20752 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20753 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20755 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20756 DAG.getVTList(MVT::i32, MVT::Other),
20757 Op.getOperand(0), Op.getOperand(1));
20760 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20761 SelectionDAG &DAG) const {
SDLoc DL(Op);
20763 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20764 Op.getOperand(0), Op.getOperand(1));
20767 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20768 SelectionDAG &DAG) const {
SDLoc DL(Op);
20770 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
}
20774 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20775 return Op.getOperand(0);
20778 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20779 SelectionDAG &DAG) const {
20780 SDValue Root = Op.getOperand(0);
20781 SDValue Trmp = Op.getOperand(1); // trampoline
20782 SDValue FPtr = Op.getOperand(2); // nested function
20783 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
20786 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20787 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20789 if (Subtarget.is64Bit()) {
20790 SDValue OutChains[6];
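// Layout of the 64-bit trampoline being materialized:
//   bytes 0-9:   movabsq $<fptr>, %r11
//   bytes 10-19: movabsq $<nest>, %r10
//   bytes 20-22: jmpq *%r11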
20792 // Large code-model.
20793 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20794 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20796 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20797 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20799 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20801 // Load the pointer to the nested function into R11.
20802 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20803 SDValue Addr = Trmp;
20804 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20805 Addr, MachinePointerInfo(TrmpAddr));
20807 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20808 DAG.getConstant(2, dl, MVT::i64));
20810 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20811 /* Alignment = */ 2);
20813 // Load the 'nest' parameter value into R10.
20814 // R10 is specified in X86CallingConv.td
20815 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20816 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20817 DAG.getConstant(10, dl, MVT::i64));
20818 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20819 Addr, MachinePointerInfo(TrmpAddr, 10));
20821 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20822 DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
20824 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20825 /* Alignment = */ 2);
20827 // Jump to the nested function.
20828 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20829 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20830 DAG.getConstant(20, dl, MVT::i64));
20831 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20832 Addr, MachinePointerInfo(TrmpAddr, 20));
20834 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20835 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20836 DAG.getConstant(22, dl, MVT::i64));
20837 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20838 Addr, MachinePointerInfo(TrmpAddr, 22));
20840 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
20842 const Function *Func =
20843 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20844 CallingConv::ID CC = Func->getCallingConv();
unsigned NestReg;

switch (CC) {
default:
20849 llvm_unreachable("Unsupported calling convention");
20850 case CallingConv::C:
20851 case CallingConv::X86_StdCall: {
20852 // Pass 'nest' parameter in ECX.
20853 // Must be kept in sync with X86CallingConv.td
20854 NestReg = X86::ECX;
20856 // Check that ECX wasn't needed by an 'inreg' parameter.
20857 FunctionType *FTy = Func->getFunctionType();
20858 const AttributeList &Attrs = Func->getAttributes();
20860 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20861 unsigned InRegCount = 0;
unsigned Idx = 1;

20864 for (FunctionType::param_iterator I = FTy->param_begin(),
20865 E = FTy->param_end(); I != E; ++I, ++Idx)
20866 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20867 auto &DL = DAG.getDataLayout();
20868 // FIXME: should only count parameters that are lowered to integers.
20869 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
}
20872 if (InRegCount > 2) {
20873 report_fatal_error("Nest register in use - reduce number of inreg"
" parameters!");
}
}
break;
}
20879 case CallingConv::X86_FastCall:
20880 case CallingConv::X86_ThisCall:
20881 case CallingConv::Fast:
20882 // Pass 'nest' parameter in EAX.
20883 // Must be kept in sync with X86CallingConv.td
20884 NestReg = X86::EAX;
break;
}

20888 SDValue OutChains[4];
20889 SDValue Addr, Disp;
20891 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20892 DAG.getConstant(10, dl, MVT::i32));
20893 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20895 // This is storing the opcode for MOV32ri.
20896 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20897 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
20899 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20900 Trmp, MachinePointerInfo(TrmpAddr));
20902 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20903 DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
20905 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20906 /* Alignment = */ 1);
20908 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20909 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20910 DAG.getConstant(5, dl, MVT::i32));
20911 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20912 Addr, MachinePointerInfo(TrmpAddr, 5),
20913 /* Alignment = */ 1);
20915 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20916 DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
20918 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20919 /* Alignment = */ 1);
20921 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20925 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20926 SelectionDAG &DAG) const {
/*
20928 The rounding mode is in bits 11:10 of FPSR, and has the following
settings:
20930 00 Round to nearest
01 Round to -inf
10 Round to +inf
11 Round to 0

20935 FLT_ROUNDS, on the other hand, expects the following:
-1 Undefined
0 Round to 0
1 Round to nearest
2 Round to +inf
3 Round to -inf

20942 To perform the conversion, we do:
20943 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
*/
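// Worked example: RC = 01 (round toward -inf) means bit 11 = 0 and bit 10 = 1,
// so the two swapped bits form the value 2; adding 1 gives 3, which is
// FLT_ROUNDS' encoding of round toward -inf.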
20946 MachineFunction &MF = DAG.getMachineFunction();
20947 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20948 unsigned StackAlignment = TFI.getStackAlignment();
20949 MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);

20952 // Save FP Control Word to stack slot
20953 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20954 SDValue StackSlot =
20955 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20957 MachineMemOperand *MMO =
20958 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20959 MachineMemOperand::MOStore, 2, 2);
20961 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20962 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20963 DAG.getVTList(MVT::Other),
20964 Ops, MVT::i16, MMO);
20966 // Load FP Control Word from stack slot
SDValue CWD =
20968 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20970 // Transform as necessary
SDValue CWD1 =
20972 DAG.getNode(ISD::SRL, DL, MVT::i16,
20973 DAG.getNode(ISD::AND, DL, MVT::i16,
20974 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20975 DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
20977 DAG.getNode(ISD::SRL, DL, MVT::i16,
20978 DAG.getNode(ISD::AND, DL, MVT::i16,
20979 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20980 DAG.getConstant(9, DL, MVT::i8));

SDValue RetVal =
20983 DAG.getNode(ISD::AND, DL, MVT::i16,
20984 DAG.getNode(ISD::ADD, DL, MVT::i16,
20985 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20986 DAG.getConstant(1, DL, MVT::i16)),
20987 DAG.getConstant(3, DL, MVT::i16));
20989 return DAG.getNode((VT.getSizeInBits() < 16 ?
20990 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20993 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
20995 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are extended
20996 // to a 512-bit vector.
20997 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
20998 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20999 // split the vector, perform the operation on its Lo and Hi parts and
21000 // concatenate the results.
21001 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
21002 assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
21004 MVT VT = Op.getSimpleValueType();
21005 MVT EltVT = VT.getVectorElementType();
21006 unsigned NumElems = VT.getVectorNumElements();
21008 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
21009 // Extend to 512 bit vector.
21010 assert((VT.is256BitVector() || VT.is128BitVector()) &&
21011 "Unsupported value type for operation");
21013 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
21014 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
21015 DAG.getUNDEF(NewVT),
Op.getOperand(0),
21017 DAG.getIntPtrConstant(0, dl));
21018 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
21020 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
21021 DAG.getIntPtrConstant(0, dl));
21024 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21025 "Unsupported element type");
21027 if (16 < NumElems) {
21028 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
SDValue Lo, Hi;
21030 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
21031 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
21033 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
21034 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
21036 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
21039 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21041 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21042 "Unsupported value type for operation");
21044 // Use native supported vector instruction vplzcntd.
21045 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21046 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21047 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21048 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21050 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21053 // Lower CTLZ using a PSHUFB lookup table implementation.
21054 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21055 const X86Subtarget &Subtarget,
21056 SelectionDAG &DAG) {
21057 MVT VT = Op.getSimpleValueType();
21058 int NumElts = VT.getVectorNumElements();
21059 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21060 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21062 // Per-nibble leading zero PSHUFB lookup table.
21063 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21064 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21065 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21066 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21068 SmallVector<SDValue, 64> LUTVec;
21069 for (int i = 0; i < NumBytes; ++i)
21070 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21071 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21073 // Begin by bitcasting the input to byte vector, then split those bytes
21074 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21075 // If the hi input nibble is zero then we add both results together, otherwise
21076 // we just take the hi result (by masking the lo result to zero before the
// add).
21078 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21079 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21081 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21082 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21083 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21084 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21085 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21087 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21088 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21089 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21090 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
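// Worked example for the byte 0x05: the hi nibble is 0, so HiZ is all-ones;
// Hi = LUT[0] = 4 and Lo = LUT[5] = 1 survives the mask, giving ctlz = 5.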
21092 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21093 // of the current vector width in the same way we did for the nibbles.
21094 // If the upper half of the input element is zero then add the halves'
21095 // leading zero counts together, otherwise just use the upper half's.
21096 // Double the width of the result until we are at target width.
21097 while (CurrVT != VT) {
21098 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21099 int CurrNumElts = CurrVT.getVectorNumElements();
21100 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21101 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21102 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21104 // Check if the upper half of the input element is zero.
21105 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21106 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21107 HiZ = DAG.getBitcast(NextVT, HiZ);
21109 // Move the upper/lower halves to the lower bits as we'll be extending to
21110 // NextVT. Mask the lower result to zero if HiZ is true and add the results
// together.
21112 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21113 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21114 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21115 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21116 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}

return Res;
}
21123 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21124 const X86Subtarget &Subtarget,
21125 SelectionDAG &DAG) {
21126 MVT VT = Op.getSimpleValueType();
21127 SDValue Op0 = Op.getOperand(0);
21129 if (Subtarget.hasAVX512())
21130 return LowerVectorCTLZ_AVX512(Op, DAG);
21132 // Decompose 256-bit ops into smaller 128-bit ops.
21133 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21134 unsigned NumElems = VT.getVectorNumElements();
21136 // Extract each 128-bit vector, perform ctlz and concat the result.
21137 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21138 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21140 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21141 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
21142 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
21145 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21146 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21149 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21150 SelectionDAG &DAG) {
21151 MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
21153 unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
21155 unsigned Opc = Op.getOpcode();

if (VT.isVector())
21158 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21160 Op = Op.getOperand(0);
21161 if (VT == MVT::i8) {
21162 // Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
21164 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
21167 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21168 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21169 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21171 if (Opc == ISD::CTLZ) {
21172 // If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
21175 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21176 DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
21179 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
21182 // Finally xor with NumBits-1.
21183 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21184 DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
21187 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
21191 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21192 MVT VT = Op.getSimpleValueType();
21193 unsigned NumBits = VT.getScalarSizeInBits();
SDLoc dl(Op);

21196 if (VT.isVector()) {
21197 SDValue N0 = Op.getOperand(0);
21198 SDValue Zero = DAG.getConstant(0, dl, VT);
21200 // lsb(x) = (x & -x)
21201 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21202 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21204 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21205 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21206 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21207 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21208 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21211 // cttz(x) = ctpop(lsb - 1)
21212 SDValue One = DAG.getConstant(1, dl, VT);
21213 return DAG.getNode(ISD::CTPOP, dl, VT,
21214 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21217 assert(Op.getOpcode() == ISD::CTTZ &&
21218 "Only scalar CTTZ requires custom lowering");
21220 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21221 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21222 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21224 // If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op.getValue(0),
21227 DAG.getConstant(NumBits, dl, VT),
21228 DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
21231 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
21234 /// Break a 256-bit integer operation into two new 128-bit ones and then
21235 /// concatenate the result back.
21236 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21237 MVT VT = Op.getSimpleValueType();
21239 assert(VT.is256BitVector() && VT.isInteger() &&
21240 "Unsupported value type for operation");
21242 unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);

21245 // Extract the LHS vectors
21246 SDValue LHS = Op.getOperand(0);
21247 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21248 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21250 // Extract the RHS vectors
21251 SDValue RHS = Op.getOperand(1);
21252 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21253 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21255 MVT EltVT = VT.getVectorElementType();
21256 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21258 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21259 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21260 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21263 /// Break a 512-bit integer operation into two new 256-bit ones and then
21264 /// concatenate the result back.
21265 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21266 MVT VT = Op.getSimpleValueType();
21268 assert(VT.is512BitVector() && VT.isInteger() &&
21269 "Unsupported value type for operation");
21271 unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);

21274 // Extract the LHS vectors
21275 SDValue LHS = Op.getOperand(0);
21276 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21277 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21279 // Extract the RHS vectors
21280 SDValue RHS = Op.getOperand(1);
21281 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21282 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21284 MVT EltVT = VT.getVectorElementType();
21285 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21287 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21288 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21289 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21292 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21293 MVT VT = Op.getSimpleValueType();
21294 if (VT.getScalarType() == MVT::i1)
21295 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21296 Op.getOperand(0), Op.getOperand(1));
21297 assert(Op.getSimpleValueType().is256BitVector() &&
21298 Op.getSimpleValueType().isInteger() &&
21299 "Only handle AVX 256-bit vector integer operation");
21300 return Lower256IntArith(Op, DAG);
21303 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21304 assert(Op.getSimpleValueType().is256BitVector() &&
21305 Op.getSimpleValueType().isInteger() &&
21306 "Only handle AVX 256-bit vector integer operation");
21307 MVT VT = Op.getSimpleValueType();
21308 unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
21311 SDValue Src = Op.getOperand(0);
21312 SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
21313 SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
21315 MVT EltVT = VT.getVectorElementType();
21316 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21317 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21318 DAG.getNode(ISD::ABS, dl, NewVT, Lo),
21319 DAG.getNode(ISD::ABS, dl, NewVT, Hi));
21322 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21323 assert(Op.getSimpleValueType().is256BitVector() &&
21324 Op.getSimpleValueType().isInteger() &&
21325 "Only handle AVX 256-bit vector integer operation");
21326 return Lower256IntArith(Op, DAG);
21329 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21330 SelectionDAG &DAG) {
SDLoc dl(Op);
21332 MVT VT = Op.getSimpleValueType();
21334 if (VT.getScalarType() == MVT::i1)
21335 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21337 // Decompose 256-bit ops into smaller 128-bit ops.
21338 if (VT.is256BitVector() && !Subtarget.hasInt256())
21339 return Lower256IntArith(Op, DAG);
21341 SDValue A = Op.getOperand(0);
21342 SDValue B = Op.getOperand(1);
21344 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21345 // vector pairs, multiply and truncate.
21346 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21347 if (Subtarget.hasInt256()) {
21348 // For 512-bit vectors, split into 256-bit vectors to allow the
21349 // sign-extension to occur.
21350 if (VT == MVT::v64i8)
21351 return Lower512IntArith(Op, DAG);
21353 // For 256-bit vectors, split into 128-bit vectors to allow the
21354 // sign-extension to occur. We don't need this on AVX512BW as we can
21355 // safely sign-extend to v32i16.
21356 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21357 return Lower256IntArith(Op, DAG);
21359 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21360 return DAG.getNode(
21361 ISD::TRUNCATE, dl, VT,
21362 DAG.getNode(ISD::MUL, dl, ExVT,
21363 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21364 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21367 assert(VT == MVT::v16i8 &&
21368 "Pre-AVX2 support only supports v16i8 multiplication");
21369 MVT ExVT = MVT::v8i16;
21371 // Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
21373 if (Subtarget.hasSSE41()) {
21374 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21375 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
21377 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21378 -1, 4, -1, 5, -1, 6, -1, 7};
21379 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21380 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21381 ALo = DAG.getBitcast(ExVT, ALo);
21382 BLo = DAG.getBitcast(ExVT, BLo);
21383 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21384 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
}

21387 // Extract the hi parts and sign extend to i16
SDValue AHi, BHi;
21389 if (Subtarget.hasSSE41()) {
21390 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21391 -1, -1, -1, -1, -1, -1, -1, -1};
21392 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21393 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21394 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21395 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
21397 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21398 -1, 12, -1, 13, -1, 14, -1, 15};
21399 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21400 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21401 AHi = DAG.getBitcast(ExVT, AHi);
21402 BHi = DAG.getBitcast(ExVT, BHi);
21403 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21404 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
}

21407 // Multiply, mask the lower 8 bits of the lo/hi results and pack
21408 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21409 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21410 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21411 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
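// The AND with 255 clears the upper byte of each i16 product, so the
// saturating PACKUS below never actually saturates; it simply concatenates the
// low bytes of RLo and RHi back into a v16i8 in the original element order.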
21412 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21413 }
21415 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21416 if (VT == MVT::v4i32) {
21417 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21418 "Should not custom lower when pmuldq is available!");
21420 // Extract the odd parts.
21421 static const int UnpackMask[] = { 1, -1, 3, -1 };
21422 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21423 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21425 // Multiply the even parts.
21426 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21427 // Now multiply odd parts.
21428 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21430 Evens = DAG.getBitcast(VT, Evens);
21431 Odds = DAG.getBitcast(VT, Odds);
21433 // Merge the two vectors back together with a shuffle. This expands into 2
21435 static const int ShufMask[] = { 0, 4, 2, 6 };
21436 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21439 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21440 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21442 // 32-bit vector types used for MULDQ/MULUDQ.
21443 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21445 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21446 // 32-bits. We can lower with this if the sign bits stretch that far.
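// Rationale: if both operands have more than 32 sign bits, each 64-bit lane is
// already the sign-extension of its low 32 bits, so the 32x32->64 signed
// multiply performed by PMULDQ produces the full 64-bit product.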
21447 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21448 DAG.ComputeNumSignBits(B) > 32) {
21449 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21450 DAG.getBitcast(MulVT, B));
21453 // Ahi = psrlqi(a, 32);
21454 // Bhi = psrlqi(b, 32);
21456 // AloBlo = pmuludq(a, b);
21457 // AloBhi = pmuludq(a, Bhi);
21458 // AhiBlo = pmuludq(Ahi, b);
21460 // Hi = psllqi(AloBhi + AhiBlo, 32);
21461 // return AloBlo + Hi;
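// This follows from writing each lane as a = Alo + (Ahi << 32) and
// b = Blo + (Bhi << 32): modulo 2^64 the product is
// Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32); the Ahi*Bhi term is shifted out
// entirely and can be ignored.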
21462 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21463 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21464 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21466 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21467 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21468 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21470 // Bit cast to 32-bit vectors for MULUDQ.
21471 SDValue Alo = DAG.getBitcast(MulVT, A);
21472 SDValue Blo = DAG.getBitcast(MulVT, B);
21474 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21476 // Only multiply lo/hi halves that aren't known to be zero.
21477 SDValue AloBlo = Zero;
21478 if (!ALoIsZero && !BLoIsZero)
21479 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21481 SDValue AloBhi = Zero;
21482 if (!ALoIsZero && !BHiIsZero) {
21483 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21484 Bhi = DAG.getBitcast(MulVT, Bhi);
21485 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21488 SDValue AhiBlo = Zero;
21489 if (!AHiIsZero && !BLoIsZero) {
21490 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21491 Ahi = DAG.getBitcast(MulVT, Ahi);
21492 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21495 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21496 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21498 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21499 }
21501 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21502 SelectionDAG &DAG) {
21503 SDLoc dl(Op);
21504 MVT VT = Op.getSimpleValueType();
21506 // Decompose 256-bit ops into smaller 128-bit ops.
21507 if (VT.is256BitVector() && !Subtarget.hasInt256())
21508 return Lower256IntArith(Op, DAG);
21510 // Only i8 vectors should need custom lowering after this.
21511 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21512 "Unsupported vector type");
21514 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21515 // logical shift down the upper half and pack back to i8.
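// In other words: for each lane, extend a and b to i16 (zero-extend for MULHU,
// sign-extend for MULHS), multiply, and take bits [15:8] of the product; those
// bits are exactly the high half of the i8 multiply.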
21516 SDValue A = Op.getOperand(0);
21517 SDValue B = Op.getOperand(1);
21519 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21520 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21521 unsigned Opcode = Op.getOpcode();
21522 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21523 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21525 // AVX2 implementations - extend xmm subvectors to ymm.
21526 if (Subtarget.hasInt256()) {
21527 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21528 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21530 if (VT == MVT::v32i8) {
21531 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21532 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21533 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21534 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21535 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21536 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21537 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21538 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21539 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21540 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21541 DAG.getConstant(8, dl, MVT::v16i16));
21542 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21543 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21544 DAG.getConstant(8, dl, MVT::v16i16));
21545 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21546 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21547 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21548 16, 17, 18, 19, 20, 21, 22, 23};
21549 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21550 24, 25, 26, 27, 28, 29, 30, 31};
21551 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21552 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21553 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21554 }
21556 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21557 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21558 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21559 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21560 DAG.getConstant(8, dl, MVT::v16i16));
21561 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21562 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21563 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21564 }
21566 assert(VT == MVT::v16i8 &&
21567 "Pre-AVX2 support only supports v16i8 multiplication");
21568 MVT ExVT = MVT::v8i16;
21570 // Extract the lo parts and zero/sign extend to i16.
21571 SDValue ALo, BLo;
21572 if (Subtarget.hasSSE41()) {
21573 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21574 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21575 } else {
21576 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21577 -1, 4, -1, 5, -1, 6, -1, 7};
21578 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21579 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21580 ALo = DAG.getBitcast(ExVT, ALo);
21581 BLo = DAG.getBitcast(ExVT, BLo);
21582 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21583 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21584 }
21586 // Extract the hi parts and zero/sign extend to i16.
21587 SDValue AHi, BHi;
21588 if (Subtarget.hasSSE41()) {
21589 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21590 -1, -1, -1, -1, -1, -1, -1, -1};
21591 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21592 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21593 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21594 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21595 } else {
21596 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21597 -1, 12, -1, 13, -1, 14, -1, 15};
21598 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21599 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21600 AHi = DAG.getBitcast(ExVT, AHi);
21601 BHi = DAG.getBitcast(ExVT, BHi);
21602 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21603 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21604 }
21606 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21607 // pack back to v16i8.
21608 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21609 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21610 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21611 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21612 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21613 }
21615 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21616 assert(Subtarget.isTargetWin64() && "Unexpected target");
21617 EVT VT = Op.getValueType();
21618 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21619 "Unexpected return type for lowering");
21623 switch (Op->getOpcode()) {
21624 default: llvm_unreachable("Unexpected request for libcall!");
21625 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21626 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21627 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21628 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21629 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21630 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21631 }
21633 SDLoc dl(Op);
21634 SDValue InChain = DAG.getEntryNode();
21636 TargetLowering::ArgListTy Args;
21637 TargetLowering::ArgListEntry Entry;
21638 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21639 EVT ArgVT = Op->getOperand(i).getValueType();
21640 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21641 "Unexpected argument type for lowering");
21642 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21643 Entry.Node = StackPtr;
21644 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21645 MachinePointerInfo(), /* Alignment = */ 16);
21646 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21647 Entry.Ty = PointerType::get(ArgTy,0);
21648 Entry.IsSExt = false;
21649 Entry.IsZExt = false;
21650 Args.push_back(Entry);
21653 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21654 getPointerTy(DAG.getDataLayout()));
21656 TargetLowering::CallLoweringInfo CLI(DAG);
21657 CLI.setDebugLoc(dl)
21658 .setChain(InChain)
21659 .setLibCallee(
21660 getLibcallCallingConv(LC),
21661 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21662 std::move(Args))
21663 .setInRegister()
21664 .setSExtResult(isSigned)
21665 .setZExtResult(!isSigned);
21667 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21668 return DAG.getBitcast(VT, CallInfo.first);
21669 }
21671 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21672 SelectionDAG &DAG) {
21673 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21674 MVT VT = Op0.getSimpleValueType();
21675 SDLoc dl(Op);
21677 // Decompose 256-bit ops into smaller 128-bit ops.
21678 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21679 unsigned Opcode = Op.getOpcode();
21680 unsigned NumElems = VT.getVectorNumElements();
21681 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21682 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21683 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21684 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21685 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21686 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21687 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21688 SDValue Ops[] = {
21689 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21690 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21691 };
21692 return DAG.getMergeValues(Ops, dl);
21693 }
21695 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21696 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21698 // PMULxD operations multiply each even value (starting at 0) of LHS with
21699 // the related value of RHS and produce a widened result.
21700 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21701 // => <2 x i64> <ae|cg>
21703 // In other words, to have all the results, we need to perform two PMULxD:
21704 // 1. one with the even values.
21705 // 2. one with the odd values.
21706 // To achieve #2, we need to place the odd values at an even position.
21708 // Place the odd value at an even position (basically, shift all values 1
21709 // step to the left):
21710 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21711 // <a|b|c|d> => <b|undef|d|undef>
21712 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21713 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21714 // <e|f|g|h> => <f|undef|h|undef>
21715 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21716 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21718 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21720 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21721 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21722 unsigned Opcode =
21723 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21724 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21725 // => <2 x i64> <ae|cg>
21726 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21727 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21728 // => <2 x i64> <bf|dh>
21729 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21731 // Shuffle it back into the right order.
21732 SDValue Highs, Lows;
21733 if (VT == MVT::v8i32) {
21734 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21735 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21736 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21737 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21738 } else {
21739 const int HighMask[] = {1, 5, 3, 7};
21740 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21741 const int LowMask[] = {0, 4, 2, 6};
21742 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21743 }
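// At this point Lows/Highs hold, for every original lane, the low and high
// 32 bits of the corresponding 64-bit product: the bitcasts expose each PMULxD
// result as (lo, hi) i32 pairs, and the shuffles above pick those halves back
// into the original lane order from the even/odd multiplies.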
21745 // If we have a signed multiply but no PMULDQ fix up the high parts of a
21746 // unsigned multiply.
21747 if (IsSigned && !Subtarget.hasSSE41()) {
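// Standard fixup: per lane, mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
// - (b < 0 ? a : 0). The SRA by 31 builds an all-ones mask for negative lanes,
// the ANDs select the other operand, and the sum is subtracted from the
// unsigned high parts computed above.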
21748 SDValue ShAmt = DAG.getConstant(
21749 31, dl,
21750 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21751 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21752 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21753 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21754 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21756 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21757 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21758 }
21760 // The first result of MUL_LOHI is actually the low value, followed by the
21762 SDValue Ops[] = {Lows, Highs};
21763 return DAG.getMergeValues(Ops, dl);
21764 }
21766 // Return true if the required (according to Opcode) shift-imm form is natively
21767 // supported by the Subtarget
21768 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21769 unsigned Opcode) {
21770 if (VT.getScalarSizeInBits() < 16)
21771 return false;
21773 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21774 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21775 return true;
21777 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21778 (VT.is256BitVector() && Subtarget.hasInt256());
21780 bool AShift = LShift && (Subtarget.hasAVX512() ||
21781 (VT != MVT::v2i64 && VT != MVT::v4i64));
21782 return (Opcode == ISD::SRA) ? AShift : LShift;
21783 }
21785 // The shift amount is a variable, but it is the same for all vector lanes.
21786 // These instructions are defined together with shift-immediate.
21787 static
21788 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21789 unsigned Opcode) {
21790 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21791 }
21793 // Return true if the required (according to Opcode) variable-shift form is
21794 // natively supported by the Subtarget
21795 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21796 unsigned Opcode) {
21798 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21799 return false;
21801 // vXi16 supported only on AVX-512, BWI
21802 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21803 return false;
21805 if (Subtarget.hasAVX512())
21806 return true;
21808 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21809 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21810 return (Opcode == ISD::SRA) ? AShift : LShift;
21811 }
21813 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21814 const X86Subtarget &Subtarget) {
21815 MVT VT = Op.getSimpleValueType();
21816 SDLoc dl(Op);
21817 SDValue R = Op.getOperand(0);
21818 SDValue Amt = Op.getOperand(1);
21820 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21821 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21823 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21824 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21825 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21826 SDValue Ex = DAG.getBitcast(ExVT, R);
21828 if (ShiftAmt >= 32) {
21829 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21830 SDValue Upper =
21831 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21832 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21833 ShiftAmt - 32, DAG);
21834 if (VT == MVT::v2i64)
21835 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21836 if (VT == MVT::v4i64)
21837 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21838 {9, 1, 11, 3, 13, 5, 15, 7});
21839 } else {
21840 // SRA upper i32, SHL whole i64 and select lower i32.
21841 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21842 ShiftAmt, DAG);
21843 SDValue Lower =
21844 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21845 Lower = DAG.getBitcast(ExVT, Lower);
21846 if (VT == MVT::v2i64)
21847 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21848 if (VT == MVT::v4i64)
21849 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21850 {8, 1, 10, 3, 12, 5, 14, 7});
21851 }
21852 return DAG.getBitcast(VT, Ex);
21853 };
21855 // Optimize shl/srl/sra with constant shift amount.
21856 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21857 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21858 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21860 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21861 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21863 // i64 SRA needs to be performed as partial shifts.
21864 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21865 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21866 return ArithmeticShiftRight64(ShiftAmt);
21868 if (VT == MVT::v16i8 ||
21869 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21870 VT == MVT::v64i8) {
21871 unsigned NumElts = VT.getVectorNumElements();
21872 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21874 // Simple i8 add case
21875 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21876 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21878 // ashr(R, 7) === cmp_slt(R, 0)
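// i.e. shifting an i8 right arithmetically by 7 replicates the sign bit into
// every bit, giving 0 for non-negative lanes and -1 for negative ones, which
// is exactly what the (0 > R) compare below produces.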
21879 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21880 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21881 if (VT.is512BitVector()) {
21882 assert(VT == MVT::v64i8 && "Unexpected element type!");
21883 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21884 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21886 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21889 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21890 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21891 return SDValue();
21893 if (Op.getOpcode() == ISD::SHL) {
21894 // Make a large shift.
21895 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21896 R, ShiftAmt, DAG);
21897 SHL = DAG.getBitcast(VT, SHL);
21898 // Zero out the rightmost bits.
21899 return DAG.getNode(ISD::AND, dl, VT, SHL,
21900 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21902 if (Op.getOpcode() == ISD::SRL) {
21903 // Make a large shift.
21904 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21905 R, ShiftAmt, DAG);
21906 SRL = DAG.getBitcast(VT, SRL);
21907 // Zero out the leftmost bits.
21908 return DAG.getNode(ISD::AND, dl, VT, SRL,
21909 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21911 if (Op.getOpcode() == ISD::SRA) {
21912 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21913 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21915 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21916 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21917 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21918 return Res;
21919 }
21920 llvm_unreachable("Unknown shift opcode.");
21921 }
21922 }
21923 }
21925 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21926 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21927 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21928 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21930 // Peek through any splat that was introduced for i64 shift vectorization.
21931 int SplatIndex = -1;
21932 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21933 if (SVN->isSplat()) {
21934 SplatIndex = SVN->getSplatIndex();
21935 Amt = Amt.getOperand(0);
21936 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21937 "Splat shuffle referencing second operand");
21940 if (Amt.getOpcode() != ISD::BITCAST ||
21941 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21942 return SDValue();
21944 Amt = Amt.getOperand(0);
21945 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21946 VT.getVectorNumElements();
21947 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21948 uint64_t ShiftAmt = 0;
21949 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21950 for (unsigned i = 0; i != Ratio; ++i) {
21951 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21952 if (!C)
21953 return SDValue();
21955 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21956 }
21958 // Check remaining shift amounts (if not a splat).
21959 if (SplatIndex < 0) {
21960 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21961 uint64_t ShAmt = 0;
21962 for (unsigned j = 0; j != Ratio; ++j) {
21963 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21964 if (!C)
21965 return SDValue();
21967 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21968 }
21969 if (ShAmt != ShiftAmt)
21970 return SDValue();
21971 }
21972 }
21974 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21975 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21977 if (Op.getOpcode() == ISD::SRA)
21978 return ArithmeticShiftRight64(ShiftAmt);
21979 }
21981 return SDValue();
21982 }
21984 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21985 const X86Subtarget &Subtarget) {
21986 MVT VT = Op.getSimpleValueType();
21987 SDLoc dl(Op);
21988 SDValue R = Op.getOperand(0);
21989 SDValue Amt = Op.getOperand(1);
21991 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21992 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21994 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21995 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21997 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21998 SDValue BaseShAmt;
21999 MVT EltVT = VT.getVectorElementType();
22001 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22002 // Check if this build_vector node is doing a splat.
22003 // If so, then set BaseShAmt equal to the splat value.
22004 BaseShAmt = BV->getSplatValue();
22005 if (BaseShAmt && BaseShAmt.isUndef())
22006 BaseShAmt = SDValue();
22007 } else {
22008 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22009 Amt = Amt.getOperand(0);
22011 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22012 if (SVN && SVN->isSplat()) {
22013 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22014 SDValue InVec = Amt.getOperand(0);
22015 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22016 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22017 "Unexpected shuffle index found!");
22018 BaseShAmt = InVec.getOperand(SplatIdx);
22019 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22020 if (ConstantSDNode *C =
22021 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22022 if (C->getZExtValue() == SplatIdx)
22023 BaseShAmt = InVec.getOperand(1);
22024 }
22025 }
22027 if (!BaseShAmt)
22028 // Avoid introducing an extract element from a shuffle.
22029 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22030 DAG.getIntPtrConstant(SplatIdx, dl));
22034 if (BaseShAmt.getNode()) {
22035 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22036 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22037 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22038 else if (EltVT.bitsLT(MVT::i32))
22039 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22041 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22045 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22046 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22047 Amt.getOpcode() == ISD::BITCAST &&
22048 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22049 Amt = Amt.getOperand(0);
22050 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22051 VT.getVectorNumElements();
22052 std::vector<SDValue> Vals(Ratio);
22053 for (unsigned i = 0; i != Ratio; ++i)
22054 Vals[i] = Amt.getOperand(i);
22055 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22056 for (unsigned j = 0; j != Ratio; ++j)
22057 if (Vals[j] != Amt.getOperand(i + j))
22058 return SDValue();
22059 }
22061 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22062 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22063 }
22065 return SDValue();
22066 }
22067 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22068 SelectionDAG &DAG) {
22069 MVT VT = Op.getSimpleValueType();
22070 SDLoc dl(Op);
22071 SDValue R = Op.getOperand(0);
22072 SDValue Amt = Op.getOperand(1);
22073 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22075 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22076 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22078 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22079 return V;
22081 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22082 return V;
22084 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22085 return Op;
22087 // XOP has 128-bit variable logical/arithmetic shifts.
22088 // +ve/-ve Amt = shift left/right.
22089 if (Subtarget.hasXOP() &&
22090 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22091 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22092 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22093 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22094 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22096 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22097 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22098 if (Op.getOpcode() == ISD::SRA)
22099 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22102 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22103 // shifts per-lane and then shuffle the partial results back together.
22104 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22105 // Splat the shift amounts so the scalar shifts above will catch it.
22106 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22107 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22108 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22109 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22110 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22113 // i64 vector arithmetic shift can be emulated with the transform:
22114 // M = lshr(SIGN_MASK, Amt)
22115 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
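// Rationale: M ends up with a single set bit per lane, at the position the
// original sign bit occupies after the logical shift. The identity
// (x ^ M) - M sign-extends x from that bit, which turns the logical shift
// into the desired arithmetic shift.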
22116 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22117 Op.getOpcode() == ISD::SRA) {
22118 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22119 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22120 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22121 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22122 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22123 return R;
22124 }
22126 // If possible, lower this packed shift into a vector multiply instead of
22127 // expanding it into a sequence of scalar shifts.
22128 // Do this only if the vector shift count is a constant build_vector.
22129 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22130 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22131 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22132 SmallVector<SDValue, 8> Elts;
22133 MVT SVT = VT.getVectorElementType();
22134 unsigned SVTBits = SVT.getSizeInBits();
22135 APInt One(SVTBits, 1);
22136 unsigned NumElems = VT.getVectorNumElements();
22138 for (unsigned i=0; i !=NumElems; ++i) {
22139 SDValue Op = Amt->getOperand(i);
22140 if (Op->isUndef()) {
22141 Elts.push_back(Op);
22142 continue;
22143 }
22145 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22146 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22147 uint64_t ShAmt = C.getZExtValue();
22148 if (ShAmt >= SVTBits) {
22149 Elts.push_back(DAG.getUNDEF(SVT));
22150 continue;
22151 }
22152 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22153 }
22154 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22155 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22158 // Lower SHL with variable shift amount.
22159 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
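// The sequence below builds the float 2^Amt per lane: (Amt << 23) added to
// 0x3f800000 (1.0f) places Amt in the exponent field, so converting back to
// integer yields 1 << Amt, and the final multiply performs the shift. This
// relies on Amt being within the valid shift range for i32 elements.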
22160 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22162 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22163 DAG.getConstant(0x3f800000U, dl, VT));
22164 Op = DAG.getBitcast(MVT::v4f32, Op);
22165 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22166 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22169 // If possible, lower this shift as a sequence of two shifts by
22170 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22172 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22174 // Could be rewritten as:
22175 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22177 // The advantage is that the two shifts from the example would be
22178 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22179 // the vector shift into four scalar shifts plus four pairs of vector
22181 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22182 unsigned TargetOpcode = X86ISD::MOVSS;
22183 bool CanBeSimplified;
22184 // The splat value for the first packed shift (the 'X' from the example).
22185 SDValue Amt1 = Amt->getOperand(0);
22186 // The splat value for the second packed shift (the 'Y' from the example).
22187 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22189 // See if it is possible to replace this node with a sequence of
22190 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22191 if (VT == MVT::v4i32) {
22192 // Check if it is legal to use a MOVSS.
22193 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22194 Amt2 == Amt->getOperand(3);
22195 if (!CanBeSimplified) {
22196 // Otherwise, check if we can still simplify this node using a MOVSD.
22197 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22198 Amt->getOperand(2) == Amt->getOperand(3);
22199 TargetOpcode = X86ISD::MOVSD;
22200 Amt2 = Amt->getOperand(2);
22201 }
22202 } else {
22203 // Do similar checks for the case where the machine value type
22204 // is MVT::v8i16.
22205 CanBeSimplified = Amt1 == Amt->getOperand(1);
22206 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22207 CanBeSimplified = Amt2 == Amt->getOperand(i);
22209 if (!CanBeSimplified) {
22210 TargetOpcode = X86ISD::MOVSD;
22211 CanBeSimplified = true;
22212 Amt2 = Amt->getOperand(4);
22213 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22214 CanBeSimplified = Amt1 == Amt->getOperand(i);
22215 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22216 CanBeSimplified = Amt2 == Amt->getOperand(j);
22220 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22221 isa<ConstantSDNode>(Amt2)) {
22222 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22223 MVT CastVT = MVT::v4i32;
22224 SDValue Splat1 =
22225 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22226 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22227 SDValue Splat2 =
22228 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22229 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22229 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22230 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22231 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22232 if (TargetOpcode == X86ISD::MOVSD)
22233 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22234 BitCast2, {0, 1, 6, 7}));
22235 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22236 BitCast2, {0, 5, 6, 7}));
22240 // v4i32 Non Uniform Shifts.
22241 // If the shift amount is constant we can shift each lane using the SSE2
22242 // immediate shifts, else we need to zero-extend each lane to the lower i64
22243 // and shift using the SSE2 variable shifts.
22244 // The separate results can then be blended together.
22245 if (VT == MVT::v4i32) {
22246 unsigned Opc = Op.getOpcode();
22247 SDValue Amt0, Amt1, Amt2, Amt3;
22248 if (ConstantAmt) {
22249 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22250 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22251 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22252 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22253 } else {
22254 // ISD::SHL is handled above but we include it here for completeness.
22255 switch (Opc) {
22256 default:
22257 llvm_unreachable("Unknown target vector shift node");
22258 case ISD::SHL:
22259 Opc = X86ISD::VSHL;
22260 break;
22261 case ISD::SRL:
22262 Opc = X86ISD::VSRL;
22263 break;
22264 case ISD::SRA:
22265 Opc = X86ISD::VSRA;
22266 break;
22267 }
22268 // The SSE2 shifts use the lower i64 as the same shift amount for
22269 // all lanes and the upper i64 is ignored. These shuffle masks
22270 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22271 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22272 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22273 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22274 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22275 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22276 }
22278 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22279 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22280 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22281 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22282 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22283 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22284 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22287 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22288 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22289 // make the existing SSE solution better.
22290 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22291 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22292 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22293 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22294 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22295 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22296 unsigned ExtOpc =
22297 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22298 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22299 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22300 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22301 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22304 if (VT == MVT::v16i8 ||
22305 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22306 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22307 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22308 unsigned ShiftOpcode = Op->getOpcode();
22310 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22311 if (VT.is512BitVector()) {
22312 // On AVX512BW targets we make use of the fact that VSELECT lowers
22313 // to a masked blend which selects bytes based just on the sign bit
22314 // extracted to a mask.
22315 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22316 V0 = DAG.getBitcast(VT, V0);
22317 V1 = DAG.getBitcast(VT, V1);
22318 Sel = DAG.getBitcast(VT, Sel);
22319 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22320 return DAG.getBitcast(SelVT,
22321 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22322 } else if (Subtarget.hasSSE41()) {
22323 // On SSE41 targets we make use of the fact that VSELECT lowers
22324 // to PBLENDVB which selects bytes based just on the sign bit.
22325 V0 = DAG.getBitcast(VT, V0);
22326 V1 = DAG.getBitcast(VT, V1);
22327 Sel = DAG.getBitcast(VT, Sel);
22328 return DAG.getBitcast(SelVT,
22329 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
22331 // On pre-SSE41 targets we test for the sign bit by comparing to
22332 // zero - a negative value will set all bits of the lanes to true
22333 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22334 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22335 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22336 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
22337 };
22339 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22340 // We can safely do this using i16 shifts as we're only interested in
22341 // the 3 lower bits of each byte.
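// The rounds below are a binary decomposition of the shift amount: after the
// << 5, bit 2 of each byte's amount sits in that byte's sign bit, so the blend
// picks "shift by 4" for lanes where it is set; doubling Amt then exposes
// bit 1 (shift by 2) and finally bit 0 (shift by 1).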
22342 Amt = DAG.getBitcast(ExtVT, Amt);
22343 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22344 Amt = DAG.getBitcast(VT, Amt);
22346 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22347 // r = VSELECT(r, shift(r, 4), a);
22348 SDValue M =
22349 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22350 R = SignBitSelect(VT, Amt, M, R);
22353 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22355 // r = VSELECT(r, shift(r, 2), a);
22356 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22357 R = SignBitSelect(VT, Amt, M, R);
22360 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22362 // return VSELECT(r, shift(r, 1), a);
22363 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22364 R = SignBitSelect(VT, Amt, M, R);
22366 return R;
22367 }
22368 if (Op->getOpcode() == ISD::SRA) {
22369 // For SRA we need to unpack each byte to the higher byte of a i16 vector
22370 // so we can correctly sign extend. We don't care what happens to the
22372 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22373 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22374 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22375 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22376 ALo = DAG.getBitcast(ExtVT, ALo);
22377 AHi = DAG.getBitcast(ExtVT, AHi);
22378 RLo = DAG.getBitcast(ExtVT, RLo);
22379 RHi = DAG.getBitcast(ExtVT, RHi);
22381 // r = VSELECT(r, shift(r, 4), a);
22382 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22383 DAG.getConstant(4, dl, ExtVT));
22384 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22385 DAG.getConstant(4, dl, ExtVT));
22386 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22387 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22390 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22391 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22393 // r = VSELECT(r, shift(r, 2), a);
22394 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22395 DAG.getConstant(2, dl, ExtVT));
22396 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22397 DAG.getConstant(2, dl, ExtVT));
22398 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22399 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22402 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22403 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22405 // r = VSELECT(r, shift(r, 1), a);
22406 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22407 DAG.getConstant(1, dl, ExtVT));
22408 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22409 DAG.getConstant(1, dl, ExtVT));
22410 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22411 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22413 // Logical shift the result back to the lower byte, leaving a zero upper
22414 // byte, meaning that we can safely pack with PACKUSWB.
22416 RLo =
22417 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22418 RHi =
22419 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22420 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22424 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22425 MVT ExtVT = MVT::v8i32;
22426 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22427 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22428 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22429 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22430 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22431 ALo = DAG.getBitcast(ExtVT, ALo);
22432 AHi = DAG.getBitcast(ExtVT, AHi);
22433 RLo = DAG.getBitcast(ExtVT, RLo);
22434 RHi = DAG.getBitcast(ExtVT, RHi);
22435 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22436 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22437 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22438 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22439 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22442 if (VT == MVT::v8i16) {
22443 unsigned ShiftOpcode = Op->getOpcode();
22445 // If we have a constant shift amount, the non-SSE41 path is best as
22446 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
22447 bool UseSSE41 = Subtarget.hasSSE41() &&
22448 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22450 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22451 // On SSE41 targets we make use of the fact that VSELECT lowers
22452 // to PBLENDVB which selects bytes based just on the sign bit.
22453 if (UseSSE41) {
22454 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22455 V0 = DAG.getBitcast(ExtVT, V0);
22456 V1 = DAG.getBitcast(ExtVT, V1);
22457 Sel = DAG.getBitcast(ExtVT, Sel);
22458 return DAG.getBitcast(
22459 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
22460 }
22461 // On pre-SSE41 targets we splat the sign bit - a negative value will
22462 // set all bits of the lanes to true and VSELECT uses that in
22463 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22464 SDValue C =
22465 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22466 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
22467 };
22469 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22470 if (UseSSE41) {
22471 // On SSE41 targets we need to replicate the shift mask in both
22472 // bytes for PBLENDVB.
22473 Amt = DAG.getNode(
22474 ISD::OR, dl, VT,
22475 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22476 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22477 } else {
22478 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22479 }
22481 // r = VSELECT(r, shift(r, 8), a);
22482 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22483 R = SignBitSelect(Amt, M, R);
22486 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22488 // r = VSELECT(r, shift(r, 4), a);
22489 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22490 R = SignBitSelect(Amt, M, R);
22493 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22495 // r = VSELECT(r, shift(r, 2), a);
22496 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22497 R = SignBitSelect(Amt, M, R);
22500 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22502 // return VSELECT(r, shift(r, 1), a);
22503 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22504 R = SignBitSelect(Amt, M, R);
22506 return R;
22507 }
22508 // Decompose 256-bit shifts into smaller 128-bit shifts.
22509 if (VT.is256BitVector())
22510 return Lower256IntArith(Op, DAG);
22512 return SDValue();
22513 }
22515 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22516 SelectionDAG &DAG) {
22517 MVT VT = Op.getSimpleValueType();
22518 SDLoc DL(Op);
22519 SDValue R = Op.getOperand(0);
22520 SDValue Amt = Op.getOperand(1);
22522 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22523 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22524 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22526 // XOP has 128-bit vector variable + immediate rotates.
22527 // +ve/-ve Amt = rotate left/right.
22529 // Split 256-bit integers.
22530 if (VT.is256BitVector())
22531 return Lower256IntArith(Op, DAG);
22533 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22535 // Attempt to rotate by immediate.
22536 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22537 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22538 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22539 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22540 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22541 DAG.getConstant(RotateAmt, DL, MVT::i8));
22545 // Use general rotate by variable (per-element).
22546 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22547 }
22549 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22550 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22551 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22552 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22553 // has only one use.
22554 SDNode *N = Op.getNode();
22555 SDValue LHS = N->getOperand(0);
22556 SDValue RHS = N->getOperand(1);
22557 unsigned BaseOp = 0;
22558 X86::CondCode Cond;
22559 SDLoc DL(Op);
22560 switch (Op.getOpcode()) {
22561 default: llvm_unreachable("Unknown ovf instruction!");
22562 case ISD::SADDO:
22563 // An add of one will be selected as an INC. Note that INC doesn't
22564 // set CF, so we can't do this for UADDO.
22565 if (isOneConstant(RHS)) {
22566 BaseOp = X86ISD::INC;
22567 Cond = X86::COND_O;
22568 break;
22569 }
22570 BaseOp = X86ISD::ADD;
22571 Cond = X86::COND_O;
22572 break;
22573 case ISD::UADDO:
22574 BaseOp = X86ISD::ADD;
22575 Cond = X86::COND_B;
22576 break;
22577 case ISD::SSUBO:
22578 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22579 // set CF, so we can't do this for USUBO.
22580 if (isOneConstant(RHS)) {
22581 BaseOp = X86ISD::DEC;
22582 Cond = X86::COND_O;
22583 break;
22584 }
22585 BaseOp = X86ISD::SUB;
22586 Cond = X86::COND_O;
22587 break;
22588 case ISD::USUBO:
22589 BaseOp = X86ISD::SUB;
22590 Cond = X86::COND_B;
22591 break;
22592 case ISD::SMULO:
22593 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22594 Cond = X86::COND_O;
22595 break;
22596 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22597 if (N->getValueType(0) == MVT::i8) {
22598 BaseOp = X86ISD::UMUL8;
22599 Cond = X86::COND_O;
22600 break;
22601 }
22602 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22603 MVT::i32);
22604 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22606 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22608 if (N->getValueType(1) == MVT::i1)
22609 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22611 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22612 }
22613 }
22615 // Also sets EFLAGS.
22616 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22617 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22619 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22621 if (N->getValueType(1) == MVT::i1)
22622 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22624 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22625 }
22627 /// Returns true if the operand type is exactly twice the native width, and
22628 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22629 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22630 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22631 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22632 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22634 if (OpWidth == 64)
22635 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22636 else if (OpWidth == 128)
22637 return Subtarget.hasCmpxchg16b();
22639 return false;
22640 }
22642 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22643 return needsCmpXchgNb(SI->getValueOperand()->getType());
22644 }
22646 // Note: this turns large loads into lock cmpxchg8b/16b.
22647 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22648 TargetLowering::AtomicExpansionKind
22649 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22650 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22651 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22652 : AtomicExpansionKind::None;
22653 }
22655 TargetLowering::AtomicExpansionKind
22656 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22657 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22658 Type *MemType = AI->getType();
22660 // If the operand is too big, we must see if cmpxchg8/16b is available
22661 // and default to library calls otherwise.
22662 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22663 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22664 : AtomicExpansionKind::None;
22665 }
22667 AtomicRMWInst::BinOp Op = AI->getOperation();
22668 switch (Op) {
22669 default:
22670 llvm_unreachable("Unknown atomic operation");
22671 case AtomicRMWInst::Xchg:
22672 case AtomicRMWInst::Add:
22673 case AtomicRMWInst::Sub:
22674 // It's better to use xadd, xsub or xchg for these in all cases.
22675 return AtomicExpansionKind::None;
22676 case AtomicRMWInst::Or:
22677 case AtomicRMWInst::And:
22678 case AtomicRMWInst::Xor:
22679 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22680 // prefix to a normal instruction for these operations.
22681 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22682 : AtomicExpansionKind::None;
22683 case AtomicRMWInst::Nand:
22684 case AtomicRMWInst::Max:
22685 case AtomicRMWInst::Min:
22686 case AtomicRMWInst::UMax:
22687 case AtomicRMWInst::UMin:
22688 // These always require a non-trivial set of data operations on x86. We must
22689 // use a cmpxchg loop.
22690 return AtomicExpansionKind::CmpXChg;
22691 }
22692 }
22694 LoadInst *
22695 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22696 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22697 Type *MemType = AI->getType();
22698 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22699 // there is no benefit in turning such RMWs into loads, and it is actually
22700 // harmful as it introduces a mfence.
22701 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22702 return nullptr;
22704 auto Builder = IRBuilder<>(AI);
22705 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22706 auto SynchScope = AI->getSynchScope();
22707 // We must restrict the ordering to avoid generating loads with Release or
22708 // ReleaseAcquire orderings.
22709 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22710 auto Ptr = AI->getPointerOperand();
22712 // Before the load we need a fence. Here is an example lifted from
22713 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22714 // is required:
22715 // Thread 0:
22716 // x.store(1, relaxed);
22717 // r1 = y.fetch_add(0, release);
22718 // Thread 1:
22719 // y.fetch_add(42, acquire);
22720 // r2 = x.load(relaxed);
22721 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22722 // lowered to just a load without a fence. A mfence flushes the store buffer,
22723 // making the optimization clearly correct.
22724 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22725 // otherwise, we might be able to be more aggressive on relaxed idempotent
22726 // rmw. In practice, they do not look useful, so we don't try to be
22727 // especially clever.
22728 if (SynchScope == SingleThread)
22729 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22730 // the IR level, so we must wrap it in an intrinsic.
22731 return nullptr;
22733 if (!Subtarget.hasMFence())
22734 // FIXME: it might make sense to use a locked operation here but on a
22735 // different cache-line to prevent cache-line bouncing. In practice it
22736 // is probably a small win, and x86 processors without mfence are rare
22737 // enough that we do not bother.
22738 return nullptr;
22740 Function *MFence =
22741 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22742 Builder.CreateCall(MFence, {});
22744 // Finally we can emit the atomic load.
22745 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22746 AI->getType()->getPrimitiveSizeInBits());
22747 Loaded->setAtomic(Order, SynchScope);
22748 AI->replaceAllUsesWith(Loaded);
22749 AI->eraseFromParent();
22751 return Loaded;
22752 }
22753 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22754 SelectionDAG &DAG) {
22755 SDLoc dl(Op);
22756 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22757 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22758 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22759 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22761 // The only fence that needs an instruction is a sequentially-consistent
22762 // cross-thread fence.
22763 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22764 FenceScope == CrossThread) {
22765 if (Subtarget.hasMFence())
22766 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22768 SDValue Chain = Op.getOperand(0);
22769 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22770 SDValue Ops[] = {
22771 DAG.getRegister(X86::ESP, MVT::i32), // Base
22772 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22773 DAG.getRegister(0, MVT::i32), // Index
22774 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22775 DAG.getRegister(0, MVT::i32), // Segment.
22776 Zero,
22777 Chain
22778 };
22779 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22780 return SDValue(Res, 0);
22781 }
22783 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22784 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22785 }
22787 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22788 SelectionDAG &DAG) {
22789 MVT T = Op.getSimpleValueType();
22790 SDLoc DL(Op);
22791 unsigned Reg = 0;
22792 unsigned size = 0;
22793 switch(T.SimpleTy) {
22794 default: llvm_unreachable("Invalid value type!");
22795 case MVT::i8: Reg = X86::AL; size = 1; break;
22796 case MVT::i16: Reg = X86::AX; size = 2; break;
22797 case MVT::i32: Reg = X86::EAX; size = 4; break;
22798 case MVT::i64:
22799 assert(Subtarget.is64Bit() && "Node not type legal!");
22800 Reg = X86::RAX; size = 8;
22801 break;
22802 }
22803 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22804 Op.getOperand(2), SDValue());
22805 SDValue Ops[] = { cpIn.getValue(0),
22806 Op.getOperand(1),
22807 Op.getOperand(3),
22808 DAG.getTargetConstant(size, DL, MVT::i8),
22809 cpIn.getValue(1) };
22810 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22811 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22812 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22813 Ops, T, MMO);
22815 SDValue cpOut =
22816 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22817 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22818 MVT::i32, cpOut.getValue(2));
22819 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22821 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22822 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22823 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22824 return SDValue();
22825 }
22827 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22828 SelectionDAG &DAG) {
22829 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22830 MVT DstVT = Op.getSimpleValueType();
22832 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22833 SrcVT == MVT::i64) {
22834 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22835 if (DstVT != MVT::f64)
22836 // This conversion needs to be expanded.
22837 return SDValue();
22839 SDValue Op0 = Op->getOperand(0);
22840 SmallVector<SDValue, 16> Elts;
22841 SDLoc dl(Op);
22842 unsigned NumElts;
22843 MVT SVT;
22844 if (SrcVT.isVector()) {
22845 NumElts = SrcVT.getVectorNumElements();
22846 SVT = SrcVT.getVectorElementType();
22848 // Widen the vector in input in the case of MVT::v2i32.
22849 // Example: from MVT::v2i32 to MVT::v4i32.
22850 for (unsigned i = 0, e = NumElts; i != e; ++i)
22851 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22852 DAG.getIntPtrConstant(i, dl)));
22853 } else {
22854 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22855 "Unexpected source type in LowerBITCAST");
22856 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22857 DAG.getIntPtrConstant(0, dl)));
22858 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22859 DAG.getIntPtrConstant(1, dl)));
22860 NumElts = 2;
22861 SVT = MVT::i32;
22862 }
22863 // Explicitly mark the extra elements as Undef.
22864 Elts.append(NumElts, DAG.getUNDEF(SVT));
22866 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22867 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22868 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22869 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22870 DAG.getIntPtrConstant(0, dl));
22873 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22874 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22875 assert((DstVT == MVT::i64 ||
22876 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22877 "Unexpected custom BITCAST");
22878 // i64 <=> MMX conversions are Legal.
22879 if (SrcVT==MVT::i64 && DstVT.isVector())
22881 if (DstVT==MVT::i64 && SrcVT.isVector())
22883 // MMX <=> MMX conversions are Legal.
22884 if (SrcVT.isVector() && DstVT.isVector())
22886 // All other conversions need to be expanded.
22890 /// Compute the horizontal sum of bytes in V for the elements of VT.
22892 /// Requires V to be a byte vector and VT to be an integer vector type with
22893 /// wider elements than V's type. The width of the elements of VT determines
22894 /// how many bytes of V are summed horizontally to produce each element of the result.
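/// For example, with a v16i8 input V and VT == v8i16, each pair of adjacent
/// bytes of V (2 bytes per 16-bit element) is summed to form one element of
/// the result; with VT == v2i64, each group of 8 bytes is summed.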
22896 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22897 const X86Subtarget &Subtarget,
22898 SelectionDAG &DAG) {
22900 MVT ByteVecVT = V.getSimpleValueType();
22901 MVT EltVT = VT.getVectorElementType();
22902 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22903 "Expected value to have byte element type.");
22904 assert(EltVT != MVT::i8 &&
22905 "Horizontal byte sum only makes sense for wider elements!");
22906 unsigned VecSize = VT.getSizeInBits();
22907 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22909 // The PSADBW instruction horizontally adds all bytes and leaves the result in i64
22910 // chunks, thus directly computing the pop count for v2i64 and v4i64.
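// Illustration: PSADBW against an all-zeros vector sums the absolute values
// (here, the per-byte pop counts) of each 8-byte group into an i64 lane. E.g.
// if the low 8 bytes of V hold the counts {2,1,0,3,1,1,2,0}, the low i64 lane
// of the result is 10, the pop count of the original low 64 bits.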
22911 if (EltVT == MVT::i64) {
22912 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22913 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22914 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22915 return DAG.getBitcast(VT, V);
22918 if (EltVT == MVT::i32) {
22919 // We unpack the low half and high half into i32s interleaved with zeros so
22920 // that we can use PSADBW to horizontally sum them. The most useful part of
22921 // this is that it lines up the results of two PSADBW instructions to be
22922 // two v2i64 vectors which, when concatenated, hold the 4 population counts. We can
22923 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
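// Sketch for the 128-bit case, writing the four 32-bit chunks of byte counts
// as [A,B,C,D]: UNPCKL/UNPCKH with zeros give [A,0,B,0] and [C,0,D,0];
// PSADBW then yields the v2i64 vectors [sum(A),sum(B)] and [sum(C),sum(D)],
// and PACKUS merges them back into the v4i32 [sum(A),sum(B),sum(C),sum(D)].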
22924 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22925 SDValue V32 = DAG.getBitcast(VT, V);
22926 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22927 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22929 // Do the horizontal sums into two v2i64s.
22930 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22931 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22932 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22933 DAG.getBitcast(ByteVecVT, Low), Zeros);
22934 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22935 DAG.getBitcast(ByteVecVT, High), Zeros);
22937 // Merge them together.
22938 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22939 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22940 DAG.getBitcast(ShortVecVT, Low),
22941 DAG.getBitcast(ShortVecVT, High));
22943 return DAG.getBitcast(VT, V);
22946 // The only element type left is i16.
22947 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22949 // To obtain pop count for each i16 element starting from the pop count for
22950 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22951 // right by 8. It is important to shift as i16s, since an i8 vector shift isn't
22952 // directly supported.
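// Worked example for one i16 lane whose two bytes hold the counts {lo, hi}:
// shifting the i16 left by 8 moves lo into the high byte, the i8 add produces
// a high byte of lo + hi, and the i16 shift right by 8 leaves lo + hi, the
// pop count of the full 16-bit element.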
22953 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22954 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22955 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22956 DAG.getBitcast(ByteVecVT, V));
22957 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22960 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22961 const X86Subtarget &Subtarget,
22962 SelectionDAG &DAG) {
22963 MVT VT = Op.getSimpleValueType();
22964 MVT EltVT = VT.getVectorElementType();
22965 unsigned VecSize = VT.getSizeInBits();
22967 // Implement a lookup table in register by using an algorithm based on:
22968 // http://wm.ite.pl/articles/sse-popcount.html
22970 // The general idea is that every lower byte nibble in the input vector is an
22971 // index into an in-register pre-computed pop count table. We then split up the
22972 // input vector into two new ones: (1) a vector with only the shifted-right
22973 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22974 // masked out higher ones) for each byte. PSHUFB is used separately with both
22975 // to index the in-register table. Next, both are added and the result is an
22976 // i8 vector where each element contains the pop count for its input byte.
22978 // To obtain the pop count for elements != i8, we follow up with the same
22979 // approach and use additional tricks as described below.
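// For example, the byte 0xB3 (0b10110011) has low nibble 0x3 and high nibble
// 0xB; LUT[0x3] = 2 and LUT[0xB] = 3, and their sum 5 is popcount(0xB3).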
22981 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22982 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22983 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22984 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22986 int NumByteElts = VecSize / 8;
22987 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22988 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22989 SmallVector<SDValue, 64> LUTVec;
22990 for (int i = 0; i < NumByteElts; ++i)
22991 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22992 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22993 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22996 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22997 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23000 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23002 // The input vector is used as the shuffle mask that indexes elements into the
23003 // LUT. After counting low and high nibbles, add the vector to obtain the
23004 // final pop count per i8 element.
23005 SDValue HighPopCnt =
23006 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23007 SDValue LowPopCnt =
23008 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23009 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23011 if (EltVT == MVT::i8)
23014 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23017 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23018 const X86Subtarget &Subtarget,
23019 SelectionDAG &DAG) {
23020 MVT VT = Op.getSimpleValueType();
23021 assert(VT.is128BitVector() &&
23022 "Only 128-bit vector bitmath lowering supported.");
23024 int VecSize = VT.getSizeInBits();
23025 MVT EltVT = VT.getVectorElementType();
23026 int Len = EltVT.getSizeInBits();
23028 // This is the vectorized version of the "best" algorithm from
23029 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23030 // with a minor tweak to use a series of adds + shifts instead of vector
23031 // multiplications. Implemented for all integer vector types. We only use
23032 // this when we don't have SSSE3, which allows a LUT-based lowering that is
23033 // much faster, even faster than using native popcnt instructions.
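// Worked example on a single byte v = 0xB3 (pop count 5):
//   v - ((v >> 1) & 0x55)            -> 0x62 (2-bit field sums 01|10|00|10)
//   (v & 0x33) + ((v >> 2) & 0x33)   -> 0x32 (nibble sums 3 and 2)
//   (v + (v >> 4)) & 0x0F            -> 0x05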
23035 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23036 MVT VT = V.getSimpleValueType();
23037 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23038 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23040 auto GetMask = [&](SDValue V, APInt Mask) {
23041 MVT VT = V.getSimpleValueType();
23042 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23043 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23046 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23047 // x86, so set the SRL type to have elements at least i16 wide. This is
23048 // correct because all of our SRLs are followed immediately by a mask anyway
23049 // that handles any bits that sneak into the high bits of the byte elements.
23050 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23054 // v = v - ((v >> 1) & 0x55555555...)
23056 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23057 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23058 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23060 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23061 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23062 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23063 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23064 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23066 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23067 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23068 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23069 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23071 // At this point, V contains the byte-wise population count, and we are
23072 // merely doing a horizontal sum if necessary to get the wider element type.
23074 if (EltVT == MVT::i8)
23077 return LowerHorizontalByteSum(
23078 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23082 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23083 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23084 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23085 SelectionDAG &DAG) {
23086 MVT VT = Op.getSimpleValueType();
23087 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23088 "Unknown CTPOP type to handle");
23089 SDLoc DL(Op.getNode());
23090 SDValue Op0 = Op.getOperand(0);
23092 if (!Subtarget.hasSSSE3()) {
23093 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23094 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23095 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23098 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
23099 unsigned NumElems = VT.getVectorNumElements();
23101 // Extract each 128-bit vector, compute pop count and concat the result.
23102 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
23103 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
23105 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23106 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
23107 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
23110 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
23111 unsigned NumElems = VT.getVectorNumElements();
23113 // Extract each 256-bit vector, compute pop count and concat the result.
23114 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
23115 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
23117 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23118 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
23119 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
23122 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23125 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23126 SelectionDAG &DAG) {
23127 assert(Op.getSimpleValueType().isVector() &&
23128 "We only do custom lowering for vector population count.");
23129 return LowerVectorCTPOP(Op, Subtarget, DAG);
23132 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23133 MVT VT = Op.getSimpleValueType();
23134 SDValue In = Op.getOperand(0);
23137 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23138 // perform the BITREVERSE.
23139 if (!VT.isVector()) {
23140 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23141 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23142 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23144 DAG.getIntPtrConstant(0, DL));
23147 MVT SVT = VT.getVectorElementType();
23148 int NumElts = VT.getVectorNumElements();
23149 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23151 // Decompose 256-bit ops into smaller 128-bit ops.
23152 if (VT.is256BitVector()) {
23153 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
23154 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
23156 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
23157 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
23158 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
23159 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
23162 assert(VT.is128BitVector() &&
23163 "Only 128-bit vector bitreverse lowering supported.");
23165 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23166 // perform the BSWAP in the shuffle.
23167 // It's best to shuffle using the second operand, as this will implicitly allow
23168 // memory folding for multiple vectors.
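// Illustrative sketch for a v4i32 element 0 (ScalarSizeInBytes == 4): the loop
// below emits the selector bytes (19|0x40), (18|0x40), (17|0x40), (16|0x40),
// so VPPERM reads bytes 3..0 of element 0 of the second source in reverse
// order (the BSWAP) and bit-reverses each of them via the (2 << 5) operation bits.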
23169 SmallVector<SDValue, 16> MaskElts;
23170 for (int i = 0; i != NumElts; ++i) {
23171 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23172 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23173 int PermuteByte = SourceByte | (2 << 5);
23174 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23178 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23179 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23180 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23182 return DAG.getBitcast(VT, Res);
23185 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23186 SelectionDAG &DAG) {
23187 if (Subtarget.hasXOP())
23188 return LowerBITREVERSE_XOP(Op, DAG);
23190 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23192 MVT VT = Op.getSimpleValueType();
23193 SDValue In = Op.getOperand(0);
23196 unsigned NumElts = VT.getVectorNumElements();
23197 assert(VT.getScalarType() == MVT::i8 &&
23198 "Only byte vector BITREVERSE supported");
23200 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23201 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
23202 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
23203 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
23204 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
23205 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
23206 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
23207 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
23210 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23211 // two nibbles, and a PSHUFB lookup is used to find the bitreverse of each
23212 // 0-15 value (moved to the other nibble).
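// Worked example for the byte 0x1E (0b00011110): the low nibble 0xE indexes
// LoLUT to give 0x70 and the high nibble 0x1 indexes HiLUT to give 0x08;
// OR-ing them yields 0x78 (0b01111000), the bit-reversed byte.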
23213 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23214 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23215 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23217 const int LoLUT[16] = {
23218 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23219 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23220 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23221 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23222 const int HiLUT[16] = {
23223 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23224 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23225 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23226 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23228 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23229 for (unsigned i = 0; i < NumElts; ++i) {
23230 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23231 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23234 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23235 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23236 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23237 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23238 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23241 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23242 unsigned NewOpc = 0;
23243 switch (N->getOpcode()) {
23244 case ISD::ATOMIC_LOAD_ADD:
23245 NewOpc = X86ISD::LADD;
23247 case ISD::ATOMIC_LOAD_SUB:
23248 NewOpc = X86ISD::LSUB;
23250 case ISD::ATOMIC_LOAD_OR:
23251 NewOpc = X86ISD::LOR;
23253 case ISD::ATOMIC_LOAD_XOR:
23254 NewOpc = X86ISD::LXOR;
23256 case ISD::ATOMIC_LOAD_AND:
23257 NewOpc = X86ISD::LAND;
23260 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23263 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23264 return DAG.getMemIntrinsicNode(
23265 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23266 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23267 /*MemVT=*/N->getSimpleValueType(0), MMO);
23270 /// Lower atomic_load_ops into LOCK-prefixed operations.
23271 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23272 const X86Subtarget &Subtarget) {
23273 SDValue Chain = N->getOperand(0);
23274 SDValue LHS = N->getOperand(1);
23275 SDValue RHS = N->getOperand(2);
23276 unsigned Opc = N->getOpcode();
23277 MVT VT = N->getSimpleValueType(0);
23280 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23281 // can only be lowered when the result is unused. They should have already
23282 // been transformed into a cmpxchg loop in AtomicExpand.
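// For example, an 'atomicrmw add' whose result is unused can become a single
// "lock add" to memory via X86ISD::LADD below, while one whose result is used
// stays as ATOMIC_LOAD_ADD and is selected as "lock xadd".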
23283 if (N->hasAnyUseOfValue(0)) {
23284 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23285 // select LXADD if LOCK_SUB can't be selected.
23286 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23287 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23288 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23289 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23290 RHS, AN->getMemOperand());
23292 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23293 "Used AtomicRMW ops other than Add should have been expanded!");
23297 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23298 // RAUW the chain, but don't worry about the result, as it's unused.
23299 assert(!N->hasAnyUseOfValue(0));
23300 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23304 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23305 SDNode *Node = Op.getNode();
23307 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23309 // Convert seq_cst store -> xchg
23310 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23311 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23312 // (The only way to get a 16-byte store is cmpxchg16b)
23313 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
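// For example, a seq_cst i32 store is lowered as an ATOMIC_SWAP whose loaded
// value is ignored; it selects to an (implicitly LOCKed) xchg, which provides
// the required store-load barrier.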
23314 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23315 AtomicOrdering::SequentiallyConsistent ||
23316 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23317 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23318 cast<AtomicSDNode>(Node)->getMemoryVT(),
23319 Node->getOperand(0),
23320 Node->getOperand(1), Node->getOperand(2),
23321 cast<AtomicSDNode>(Node)->getMemOperand());
23322 return Swap.getValue(1);
23324 // Other atomic stores have a simple pattern.
23328 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
23329 MVT VT = Op.getNode()->getSimpleValueType(0);
23331 // Let legalize expand this if it isn't a legal type yet.
23332 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23335 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23338 bool ExtraOp = false;
23339 switch (Op.getOpcode()) {
23340 default: llvm_unreachable("Invalid code");
23341 case ISD::ADDC: Opc = X86ISD::ADD; break;
23342 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
23343 case ISD::SUBC: Opc = X86ISD::SUB; break;
23344 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
23348 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23350 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
23351 Op.getOperand(1), Op.getOperand(2));
23354 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23355 SelectionDAG &DAG) {
23356 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23358 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23359 // which returns the values as { float, float } (in XMM0) or
23360 // { double, double } (which is returned in XMM0, XMM1).
23362 SDValue Arg = Op.getOperand(0);
23363 EVT ArgVT = Arg.getValueType();
23364 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23366 TargetLowering::ArgListTy Args;
23367 TargetLowering::ArgListEntry Entry;
23371 Entry.IsSExt = false;
23372 Entry.IsZExt = false;
23373 Args.push_back(Entry);
23375 bool isF64 = ArgVT == MVT::f64;
23376 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23377 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23378 // the results are returned via SRet in memory.
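// Conceptually, the entry points behave like
//   { float, float }  __sincosf_stret(float);
//   { double, double } __sincos_stret(double);
// Below, the f32 variant is modeled with a <4 x float> return so that the
// sin/cos pair can be extracted from the low two lanes of XMM0.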
23379 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23382 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23384 Type *RetTy = isF64
23385 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
23386 : (Type*)VectorType::get(ArgTy, 4);
23388 TargetLowering::CallLoweringInfo CLI(DAG);
23389 CLI.setDebugLoc(dl)
23390 .setChain(DAG.getEntryNode())
23391 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23393 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23396 // Returned in xmm0 and xmm1.
23397 return CallResult.first;
23399 // Returned in bits 0:31 and 32:63 of xmm0.
23400 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23401 CallResult.first, DAG.getIntPtrConstant(0, dl));
23402 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23403 CallResult.first, DAG.getIntPtrConstant(1, dl));
23404 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23405 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23408 /// Widen a vector input to a vector of NVT. The
23409 /// input vector must have the same element type as NVT.
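/// For example, widening a <2 x i32> build_vector to <4 x i32> appends two
/// undef (or zero, if FillWithZeroes) elements; other inputs are inserted as
/// a subvector at index 0 of an undef (or zero) vector of type NVT.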
23410 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23411 bool FillWithZeroes = false) {
23412 // Check if InOp already has the right width.
23413 MVT InVT = InOp.getSimpleValueType();
23417 if (InOp.isUndef())
23418 return DAG.getUNDEF(NVT);
23420 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23421 "input and widen element type must match");
23423 unsigned InNumElts = InVT.getVectorNumElements();
23424 unsigned WidenNumElts = NVT.getVectorNumElements();
23425 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23426 "Unexpected request for vector widening");
23428 EVT EltVT = NVT.getVectorElementType();
23431 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23432 InOp.getNumOperands() == 2) {
23433 SDValue N1 = InOp.getOperand(1);
23434 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23436 InOp = InOp.getOperand(0);
23437 InVT = InOp.getSimpleValueType();
23438 InNumElts = InVT.getVectorNumElements();
23441 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23442 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23443 SmallVector<SDValue, 16> Ops;
23444 for (unsigned i = 0; i < InNumElts; ++i)
23445 Ops.push_back(InOp.getOperand(i));
23447 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23448 DAG.getUNDEF(EltVT);
23449 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23450 Ops.push_back(FillVal);
23451 return DAG.getBuildVector(NVT, dl, Ops);
23453 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23455 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23456 InOp, DAG.getIntPtrConstant(0, dl));
23459 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23460 SelectionDAG &DAG) {
23461 assert(Subtarget.hasAVX512() &&
23462 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23464 // X86 scatter kills the mask register, so its type should be added to
23465 // the list of return values.
23466 // If the "scatter" has 2 return values, it is already handled.
23467 if (Op.getNode()->getNumValues() == 2)
23470 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23471 SDValue Src = N->getValue();
23472 MVT VT = Src.getSimpleValueType();
23473 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23476 SDValue NewScatter;
23477 SDValue Index = N->getIndex();
23478 SDValue Mask = N->getMask();
23479 SDValue Chain = N->getChain();
23480 SDValue BasePtr = N->getBasePtr();
23481 MVT MemVT = N->getMemoryVT().getSimpleVT();
23482 MVT IndexVT = Index.getSimpleValueType();
23483 MVT MaskVT = Mask.getSimpleValueType();
23485 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23486 // The v2i32 value was promoted to v2i64.
23487 // Now we "redo" the type legalizer's work and widen the original
23488 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23490 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23491 "Unexpected memory type");
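// For example, a promoted <2 x i64> <a, b> viewed as <4 x i32> is
// <a_lo, a_hi, b_lo, b_hi>; the {0, 2, -1, -1} shuffle below keeps the
// original 32-bit values and widens to <a_lo, b_lo, undef, undef>.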
23492 int ShuffleMask[] = {0, 2, -1, -1};
23493 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23494 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23495 // Now we have 4 elements instead of 2.
23496 // Expand the index.
23497 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23498 Index = ExtendToType(Index, NewIndexVT, DAG);
23500 // Expand the mask with zeroes.
23501 // The mask may be <2 x i64> or <2 x i1> at this moment.
23502 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23503 "Unexpected mask type");
23504 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23505 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23509 unsigned NumElts = VT.getVectorNumElements();
23510 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23511 !Index.getSimpleValueType().is512BitVector()) {
23512 // AVX512F supports only 512-bit vectors. Either the data or the index
23513 // should be 512 bits wide. If both the index and the data are currently
23514 // 256-bit, but the vector contains 8 elements, we just sign-extend the index.
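// E.g. a v8i32 scatter with a v8i32 index on plain AVX512F: the index is
// sign-extended to v8i64 so that either the data or the index is 512 bits
// wide, and the 512-bit scatter instruction can then be used.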
23515 if (IndexVT == MVT::v8i32)
23516 // Just extend index
23517 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23519 // The minimal number of elts in scatter is 8
23522 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23523 // Use original index here, do not modify the index twice
23524 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23525 if (IndexVT.getScalarType() == MVT::i32)
23526 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23529 // At this point we have a promoted mask operand.
23530 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23531 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23532 // Use the original mask here, do not modify the mask twice
23533 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23535 // The value that should be stored
23536 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23537 Src = ExtendToType(Src, NewVT, DAG);
23540 // If the mask is "wide" at this point, truncate it to an i1 vector.
23541 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23542 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23544 // The mask is killed by the scatter, so add it to the result values.
23545 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23546 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23547 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23548 N->getMemOperand());
23549 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23550 return SDValue(NewScatter.getNode(), 1);
23553 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23554 SelectionDAG &DAG) {
23556 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23557 MVT VT = Op.getSimpleValueType();
23558 MVT ScalarVT = VT.getScalarType();
23559 SDValue Mask = N->getMask();
23562 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23563 "Expanding masked load is supported on AVX-512 target only!");
23565 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23566 "Expanding masked load is supported for 32 and 64-bit types only!");
23568 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23569 // VLX. Expanding loads of these types are handled below.
23570 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23573 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23574 "Cannot lower masked load op.");
23576 assert((ScalarVT.getSizeInBits() >= 32 ||
23577 (Subtarget.hasBWI() &&
23578 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23579 "Unsupported masked load op.");
23581 // This operation is legal for targets with VLX, but without
23582 // VLX the vector should be widened to 512 bits.
23583 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23584 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23585 SDValue Src0 = N->getSrc0();
23586 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23588 // Mask element has to be i1.
23589 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23590 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23591 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23593 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23595 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23596 if (MaskEltTy != MVT::i1)
23597 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23598 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23599 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23600 N->getBasePtr(), Mask, Src0,
23601 N->getMemoryVT(), N->getMemOperand(),
23602 N->getExtensionType(),
23603 N->isExpandingLoad());
23605 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23606 NewLoad.getValue(0),
23607 DAG.getIntPtrConstant(0, dl));
23608 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23609 return DAG.getMergeValues(RetOps, dl);
23612 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23613 SelectionDAG &DAG) {
23614 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23615 SDValue DataToStore = N->getValue();
23616 MVT VT = DataToStore.getSimpleValueType();
23617 MVT ScalarVT = VT.getScalarType();
23618 SDValue Mask = N->getMask();
23621 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23622 "Compressing masked store is supported on AVX-512 targets only!");
23624 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23625 "Compressing masked store is supported for 32 and 64-bit types only!");
23627 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23628 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23631 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23632 "Cannot lower masked store op.");
23634 assert((ScalarVT.getSizeInBits() >= 32 ||
23635 (Subtarget.hasBWI() &&
23636 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23637 "Unsupported masked store op.");
23639 // This operation is legal for targets with VLX, but without
23640 // VLX the vector should be widened to 512 bits.
23641 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23642 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23644 // Mask element has to be i1.
23645 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23646 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23647 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23649 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23651 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23652 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23653 if (MaskEltTy != MVT::i1)
23654 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23655 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23656 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23657 Mask, N->getMemoryVT(), N->getMemOperand(),
23658 N->isTruncatingStore(), N->isCompressingStore());
23661 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23662 SelectionDAG &DAG) {
23663 assert(Subtarget.hasAVX512() &&
23664 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23666 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23668 MVT VT = Op.getSimpleValueType();
23669 SDValue Index = N->getIndex();
23670 SDValue Mask = N->getMask();
23671 SDValue Src0 = N->getValue();
23672 MVT IndexVT = Index.getSimpleValueType();
23673 MVT MaskVT = Mask.getSimpleValueType();
23675 unsigned NumElts = VT.getVectorNumElements();
23676 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23678 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23679 !Index.getSimpleValueType().is512BitVector()) {
23680 // AVX512F supports only 512-bit vectors. Either the data or the index
23681 // should be 512 bits wide. If both the index and the data are currently
23682 // 256-bit, but the vector contains 8 elements, we just sign-extend the index.
23683 if (NumElts == 8) {
23684 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23685 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23686 N->getOperand(3), Index };
23687 DAG.UpdateNodeOperands(N, Ops);
23691 // Minimal number of elements in Gather
23694 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23695 Index = ExtendToType(Index, NewIndexVT, DAG);
23696 if (IndexVT.getScalarType() == MVT::i32)
23697 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23700 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23701 // At this point we have a promoted mask operand.
23702 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23703 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23704 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23705 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23707 // The pass-through value
23708 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23709 Src0 = ExtendToType(Src0, NewVT, DAG);
23711 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23712 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23713 N->getMemoryVT(), dl, Ops,
23714 N->getMemOperand());
23715 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23716 NewGather.getValue(0),
23717 DAG.getIntPtrConstant(0, dl));
23718 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23719 return DAG.getMergeValues(RetOps, dl);
23724 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23725 SelectionDAG &DAG) const {
23726 // TODO: Eventually, the lowering of these nodes should be informed by or
23727 // deferred to the GC strategy for the function in which they appear. For
23728 // now, however, they must be lowered to something. Since they are logically
23729 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23730 // require special handling for these nodes), lower them as literal NOOPs for the time being.
23732 SmallVector<SDValue, 2> Ops;
23734 Ops.push_back(Op.getOperand(0));
23735 if (Op->getGluedNode())
23736 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23739 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23740 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23745 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23746 SelectionDAG &DAG) const {
23747 // TODO: Eventually, the lowering of these nodes should be informed by or
23748 // deferred to the GC strategy for the function in which they appear. For
23749 // now, however, they must be lowered to something. Since they are logically
23750 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23751 // require special handling for these nodes), lower them as literal NOOPs for the time being.
23753 SmallVector<SDValue, 2> Ops;
23755 Ops.push_back(Op.getOperand(0));
23756 if (Op->getGluedNode())
23757 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23760 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23761 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23766 /// Provide custom lowering hooks for some operations.
23767 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23768 switch (Op.getOpcode()) {
23769 default: llvm_unreachable("Should not custom lower this!");
23770 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23771 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23772 return LowerCMP_SWAP(Op, Subtarget, DAG);
23773 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23774 case ISD::ATOMIC_LOAD_ADD:
23775 case ISD::ATOMIC_LOAD_SUB:
23776 case ISD::ATOMIC_LOAD_OR:
23777 case ISD::ATOMIC_LOAD_XOR:
23778 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23779 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23780 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23781 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23782 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23783 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23784 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23785 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23786 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23787 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23788 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23789 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23790 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23791 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23792 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23793 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23794 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23795 case ISD::SHL_PARTS:
23796 case ISD::SRA_PARTS:
23797 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23798 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23799 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23800 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23801 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23802 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23803 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23804 case ISD::ZERO_EXTEND_VECTOR_INREG:
23805 case ISD::SIGN_EXTEND_VECTOR_INREG:
23806 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23807 case ISD::FP_TO_SINT:
23808 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23809 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23810 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23812 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23813 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23814 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23815 case ISD::SETCC: return LowerSETCC(Op, DAG);
23816 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23817 case ISD::SELECT: return LowerSELECT(Op, DAG);
23818 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23819 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23820 case ISD::VASTART: return LowerVASTART(Op, DAG);
23821 case ISD::VAARG: return LowerVAARG(Op, DAG);
23822 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23823 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23824 case ISD::INTRINSIC_VOID:
23825 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23826 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23827 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23828 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23829 case ISD::FRAME_TO_ARGS_OFFSET:
23830 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23831 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23832 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23833 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23834 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23835 case ISD::EH_SJLJ_SETUP_DISPATCH:
23836 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23837 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23838 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23839 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23841 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23843 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23844 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23846 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23847 case ISD::UMUL_LOHI:
23848 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23849 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23852 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23858 case ISD::UMULO: return LowerXALUO(Op, DAG);
23859 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23860 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23864 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23866 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23870 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23871 case ISD::ABS: return LowerABS(Op, DAG);
23872 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23873 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23874 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23875 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23876 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23877 case ISD::GC_TRANSITION_START:
23878 return LowerGC_TRANSITION_START(Op, DAG);
23879 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23880 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23884 /// Places new result values for the node in Results (their number
23885 /// and types must exactly match those of the original return values of
23886 /// the node), or leaves Results empty, which indicates that the node is not
23887 /// to be custom lowered after all.
23888 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23889 SmallVectorImpl<SDValue> &Results,
23890 SelectionDAG &DAG) const {
23891 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23893 if (!Res.getNode())
23896 assert((N->getNumValues() <= Res->getNumValues()) &&
23897 "Lowering returned the wrong number of results!");
23899 // Place the new result values based on N's result numbers.
23900 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
23901 // than the original node; the chain should be dropped (the last value).
23902 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23903 Results.push_back(Res.getValue(I));
23906 /// Replace a node with an illegal result type with a new node built out of custom code.
23908 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23909 SmallVectorImpl<SDValue>&Results,
23910 SelectionDAG &DAG) const {
23912 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23913 switch (N->getOpcode()) {
23915 llvm_unreachable("Do not know how to custom type legalize this operation!");
23916 case X86ISD::AVG: {
23917 // Legalize types for X86ISD::AVG by expanding vectors.
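// For example, a v8i8 AVG is concatenated with undef operands up to v16i8
// (RegSize == 128), the AVG is performed on the wide type, and the original
// v8i8 result is extracted back out as a subvector.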
23918 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23920 auto InVT = N->getValueType(0);
23921 auto InVTSize = InVT.getSizeInBits();
23922 const unsigned RegSize =
23923 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23924 assert((Subtarget.hasBWI() || RegSize < 512) &&
23925 "512-bit vector requires AVX512BW");
23926 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23927 "256-bit vector requires AVX2");
23929 auto ElemVT = InVT.getVectorElementType();
23930 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23931 RegSize / ElemVT.getSizeInBits());
23932 assert(RegSize % InVT.getSizeInBits() == 0);
23933 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23935 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23936 Ops[0] = N->getOperand(0);
23937 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23938 Ops[0] = N->getOperand(1);
23939 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23941 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23942 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23943 DAG.getIntPtrConstant(0, dl)));
23946 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23947 case X86ISD::FMINC:
23949 case X86ISD::FMAXC:
23950 case X86ISD::FMAX: {
23951 EVT VT = N->getValueType(0);
23952 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23953 SDValue UNDEF = DAG.getUNDEF(VT);
23954 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23955 N->getOperand(0), UNDEF);
23956 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23957 N->getOperand(1), UNDEF);
23958 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23966 case ISD::UDIVREM: {
23967 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23968 Results.push_back(V);
23971 case ISD::FP_TO_SINT:
23972 case ISD::FP_TO_UINT: {
23973 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23975 if (N->getValueType(0) == MVT::v2i32) {
23976 assert((IsSigned || Subtarget.hasAVX512()) &&
23977 "Can only handle signed conversion without AVX512");
23978 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23979 SDValue Src = N->getOperand(0);
23980 if (Src.getValueType() == MVT::v2f64) {
23981 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23982 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23983 : X86ISD::CVTTP2UI,
23984 dl, MVT::v4i32, Src);
23985 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23986 Results.push_back(Res);
23989 if (Src.getValueType() == MVT::v2f32) {
23990 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23991 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23992 DAG.getUNDEF(MVT::v2f32));
23993 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23994 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23995 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23996 Results.push_back(Res);
24000 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24001 // so early out here.
24005 std::pair<SDValue,SDValue> Vals =
24006 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24007 SDValue FIST = Vals.first, StackSlot = Vals.second;
24008 if (FIST.getNode()) {
24009 EVT VT = N->getValueType(0);
24010 // Return a load from the stack slot.
24011 if (StackSlot.getNode())
24013 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24015 Results.push_back(FIST);
24019 case ISD::SINT_TO_FP: {
24020 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24021 SDValue Src = N->getOperand(0);
24022 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24024 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24027 case ISD::UINT_TO_FP: {
24028 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24029 EVT VT = N->getValueType(0);
24030 if (VT != MVT::v2f32)
24032 SDValue Src = N->getOperand(0);
24033 EVT SrcVT = Src.getValueType();
24034 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24035 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24038 if (SrcVT != MVT::v2i32)
24040 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24042 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24043 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24044 DAG.getBitcast(MVT::v2i64, VBias));
24045 Or = DAG.getBitcast(MVT::v2f64, Or);
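// This is the standard double-precision bias trick: OR-ing the zero-extended
// i32 into the mantissa of 2^52 (0x4330000000000000) produces the double
// value 2^52 + x exactly, so subtracting 2^52 recovers x as a double, e.g.
// x = 5 yields 4503599627370501.0 - 4503599627370496.0 == 5.0.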
24046 // TODO: Are there any fast-math-flags to propagate here?
24047 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24048 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24051 case ISD::FP_ROUND: {
24052 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24054 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24055 Results.push_back(V);
24058 case ISD::FP_EXTEND: {
24059 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24060 // No other ValueType for FP_EXTEND should reach this point.
24061 assert(N->getValueType(0) == MVT::v2f32 &&
24062 "Do not know how to legalize this Node");
24065 case ISD::INTRINSIC_W_CHAIN: {
24066 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24068 default : llvm_unreachable("Do not know how to custom type "
24069 "legalize this intrinsic operation!");
24070 case Intrinsic::x86_rdtsc:
24071 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24073 case Intrinsic::x86_rdtscp:
24074 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24076 case Intrinsic::x86_rdpmc:
24077 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24079 case Intrinsic::x86_xgetbv:
24080 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24083 case ISD::INTRINSIC_WO_CHAIN: {
24084 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24085 Results.push_back(V);
24088 case ISD::READCYCLECOUNTER: {
24089 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24092 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24093 EVT T = N->getValueType(0);
24094 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24095 bool Regs64bit = T == MVT::i128;
24096 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
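// A sketch of the register protocol used below: the expected value goes in
// EDX:EAX (RDX:RAX for cmpxchg16b), the replacement value in ECX:EBX
// (RCX:RBX), and the previous memory value comes back in EDX:EAX (RDX:RAX),
// with ZF in EFLAGS indicating success.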
24097 SDValue cpInL, cpInH;
24098 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24099 DAG.getConstant(0, dl, HalfT));
24100 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24101 DAG.getConstant(1, dl, HalfT));
24102 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24103 Regs64bit ? X86::RAX : X86::EAX,
24105 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24106 Regs64bit ? X86::RDX : X86::EDX,
24107 cpInH, cpInL.getValue(1));
24108 SDValue swapInL, swapInH;
24109 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24110 DAG.getConstant(0, dl, HalfT));
24111 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24112 DAG.getConstant(1, dl, HalfT));
24114 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24115 swapInH, cpInH.getValue(1));
24116 // If the current function needs the base pointer, RBX,
24117 // we shouldn't use cmpxchg directly.
24118 // Indeed, the lowering of that instruction will clobber
24119 // that register, and since RBX will be a reserved register,
24120 // the register allocator will not make sure its value is
24121 // properly saved and restored around this live range.
24122 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24124 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24125 unsigned BasePtr = TRI->getBaseRegister();
24126 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24127 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24128 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24129 // ISel prefers the LCMPXCHG64 variant.
24130 // If that assert breaks, it means that is no longer the case,
24131 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24132 // not just EBX. This is a matter of accepting i64 input for that
24133 // pseudo, and restoring into the register of the right width
24134 // when expanding the pseudo. Everything else should just work.
24135 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24136 "Saving only half of the RBX");
24137 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24138 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24139 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24140 Regs64bit ? X86::RBX : X86::EBX,
24141 HalfT, swapInH.getValue(1));
24142 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24144 /*Glue*/ RBXSave.getValue(2)};
24145 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24148 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24149 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24150 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24151 swapInH.getValue(1));
24152 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24153 swapInL.getValue(1)};
24154 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24156 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24157 Regs64bit ? X86::RAX : X86::EAX,
24158 HalfT, Result.getValue(1));
24159 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24160 Regs64bit ? X86::RDX : X86::EDX,
24161 HalfT, cpOutL.getValue(2));
24162 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24164 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24165 MVT::i32, cpOutH.getValue(2));
24166 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24167 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24169 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24170 Results.push_back(Success);
24171 Results.push_back(EFLAGS.getValue(1));
24174 case ISD::ATOMIC_SWAP:
24175 case ISD::ATOMIC_LOAD_ADD:
24176 case ISD::ATOMIC_LOAD_SUB:
24177 case ISD::ATOMIC_LOAD_AND:
24178 case ISD::ATOMIC_LOAD_OR:
24179 case ISD::ATOMIC_LOAD_XOR:
24180 case ISD::ATOMIC_LOAD_NAND:
24181 case ISD::ATOMIC_LOAD_MIN:
24182 case ISD::ATOMIC_LOAD_MAX:
24183 case ISD::ATOMIC_LOAD_UMIN:
24184 case ISD::ATOMIC_LOAD_UMAX:
24185 case ISD::ATOMIC_LOAD: {
24186 // Delegate to generic TypeLegalization. Situations we can really handle
24187 // should have already been dealt with by AtomicExpandPass.cpp.
24190 case ISD::BITCAST: {
24191 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24192 EVT DstVT = N->getValueType(0);
24193 EVT SrcVT = N->getOperand(0)->getValueType(0);
24195 if (SrcVT != MVT::f64 ||
24196 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24199 unsigned NumElts = DstVT.getVectorNumElements();
24200 EVT SVT = DstVT.getVectorElementType();
24201 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24202 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24203 MVT::v2f64, N->getOperand(0));
24204 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24206 if (ExperimentalVectorWideningLegalization) {
24207 // If we are legalizing vectors by widening, we already have the desired
24208 // legal vector type, just return it.
24209 Results.push_back(ToVecInt);
24213 SmallVector<SDValue, 8> Elts;
24214 for (unsigned i = 0, e = NumElts; i != e; ++i)
24215 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24216 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24218 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24223 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24224 switch ((X86ISD::NodeType)Opcode) {
24225 case X86ISD::FIRST_NUMBER: break;
24226 case X86ISD::BSF: return "X86ISD::BSF";
24227 case X86ISD::BSR: return "X86ISD::BSR";
24228 case X86ISD::SHLD: return "X86ISD::SHLD";
24229 case X86ISD::SHRD: return "X86ISD::SHRD";
24230 case X86ISD::FAND: return "X86ISD::FAND";
24231 case X86ISD::FANDN: return "X86ISD::FANDN";
24232 case X86ISD::FOR: return "X86ISD::FOR";
24233 case X86ISD::FXOR: return "X86ISD::FXOR";
24234 case X86ISD::FILD: return "X86ISD::FILD";
24235 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24236 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24237 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24238 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24239 case X86ISD::FLD: return "X86ISD::FLD";
24240 case X86ISD::FST: return "X86ISD::FST";
24241 case X86ISD::CALL: return "X86ISD::CALL";
24242 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24243 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24244 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24245 case X86ISD::BT: return "X86ISD::BT";
24246 case X86ISD::CMP: return "X86ISD::CMP";
24247 case X86ISD::COMI: return "X86ISD::COMI";
24248 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24249 case X86ISD::CMPM: return "X86ISD::CMPM";
24250 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24251 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24252 case X86ISD::SETCC: return "X86ISD::SETCC";
24253 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24254 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24255 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24256 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24257 case X86ISD::CMOV: return "X86ISD::CMOV";
24258 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24259 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24260 case X86ISD::IRET: return "X86ISD::IRET";
24261 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24262 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24263 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24264 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24265 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24266 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24267 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24268 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24269 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24270 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24271 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24272 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24273 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24274 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24275 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24276 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24277 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24278 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24279 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24280 case X86ISD::HADD: return "X86ISD::HADD";
24281 case X86ISD::HSUB: return "X86ISD::HSUB";
24282 case X86ISD::FHADD: return "X86ISD::FHADD";
24283 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24284 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24285 case X86ISD::FMAX: return "X86ISD::FMAX";
24286 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24287 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24288 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24289 case X86ISD::FMIN: return "X86ISD::FMIN";
24290 case X86ISD::FMINS: return "X86ISD::FMINS";
24291 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24292 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24293 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24294 case X86ISD::FMINC: return "X86ISD::FMINC";
24295 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24296 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24297 case X86ISD::FRCP: return "X86ISD::FRCP";
24298 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24299 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24300 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24301 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24302 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24303 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24304 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24305 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24306 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24307 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24308 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24309 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24310 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24311 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24312 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24313 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24314 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24315 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24316 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24317 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24318 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24319 case X86ISD::LADD: return "X86ISD::LADD";
24320 case X86ISD::LSUB: return "X86ISD::LSUB";
24321 case X86ISD::LOR: return "X86ISD::LOR";
24322 case X86ISD::LXOR: return "X86ISD::LXOR";
24323 case X86ISD::LAND: return "X86ISD::LAND";
24324 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24325 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24326 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24327 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24328 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24329 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24330 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24331 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24332 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24333 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24334 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24335 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24336 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24337 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24338 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24339 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24340 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24341 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24342 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24343 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24344 case X86ISD::VSHL: return "X86ISD::VSHL";
24345 case X86ISD::VSRL: return "X86ISD::VSRL";
24346 case X86ISD::VSRA: return "X86ISD::VSRA";
24347 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24348 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24349 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24350 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24351 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24352 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24353 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24354 case X86ISD::CMPP: return "X86ISD::CMPP";
24355 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24356 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24357 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24358 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24359 case X86ISD::ADD: return "X86ISD::ADD";
24360 case X86ISD::SUB: return "X86ISD::SUB";
24361 case X86ISD::ADC: return "X86ISD::ADC";
24362 case X86ISD::SBB: return "X86ISD::SBB";
24363 case X86ISD::SMUL: return "X86ISD::SMUL";
24364 case X86ISD::UMUL: return "X86ISD::UMUL";
24365 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24366 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24367 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24368 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24369 case X86ISD::INC: return "X86ISD::INC";
24370 case X86ISD::DEC: return "X86ISD::DEC";
24371 case X86ISD::OR: return "X86ISD::OR";
24372 case X86ISD::XOR: return "X86ISD::XOR";
24373 case X86ISD::AND: return "X86ISD::AND";
24374 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24375 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24376 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24377 case X86ISD::PTEST: return "X86ISD::PTEST";
24378 case X86ISD::TESTP: return "X86ISD::TESTP";
24379 case X86ISD::TESTM: return "X86ISD::TESTM";
24380 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24381 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24382 case X86ISD::KTEST: return "X86ISD::KTEST";
24383 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24384 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24385 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24386 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24387 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24388 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24389 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24390 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24391 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24392 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24393 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24394 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24395 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24396 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24397 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24398 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24399 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24400 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24401 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24402 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24403 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24404 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24405 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24406 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24407 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24408 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24409 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24410 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24411 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24412 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24413 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24414 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24415 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24416 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24417 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24418 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24419 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24420 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24421 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24422 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24423 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24424 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24425 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24426 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24427 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24428 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24429 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24430 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24431 case X86ISD::SAHF: return "X86ISD::SAHF";
24432 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24433 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24434 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24435 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24436 case X86ISD::VPROT: return "X86ISD::VPROT";
24437 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24438 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24439 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24440 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24441 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24442 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24443 case X86ISD::FMADD: return "X86ISD::FMADD";
24444 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24445 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24446 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24447 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24448 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24449 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24450 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24451 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24452 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24453 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24454 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24455 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24456 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24457 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24458 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24459 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24460 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24461 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24462 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24463 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24464 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24465 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24466 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24467 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24468 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24469 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24470 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24471 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24472 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24473 case X86ISD::XTEST: return "X86ISD::XTEST";
24474 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24475 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24476 case X86ISD::SELECT: return "X86ISD::SELECT";
24477 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24478 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24479 case X86ISD::RCP28: return "X86ISD::RCP28";
24480 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24481 case X86ISD::EXP2: return "X86ISD::EXP2";
24482 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24483 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24484 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24485 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24486 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24487 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24488 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24489 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24490 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24491 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24492 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24493 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24494 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24495 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24496 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24497 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24498 case X86ISD::ADDS: return "X86ISD::ADDS";
24499 case X86ISD::SUBS: return "X86ISD::SUBS";
24500 case X86ISD::AVG: return "X86ISD::AVG";
24501 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24502 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24503 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24504 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24505 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24506 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24507 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24508 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24509 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24510 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24511 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24512 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24513 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24514 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24515 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24516 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24517 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24518 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24519 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24520 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24521 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24522 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24523 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24524 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24529 /// Return true if the addressing mode represented by AM is legal for this
24530 /// target, for a load/store of the specified type.
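/// X86 addressing modes have the general form
/// base + index*scale + disp (scale in {1, 2, 4, 8}) with a sign-extended
/// 32-bit displacement; the checks below enforce the remaining restrictions.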
24531 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24532 const AddrMode &AM, Type *Ty,
24533 unsigned AS) const {
24534 // X86 supports extremely general addressing modes.
24535 CodeModel::Model M = getTargetMachine().getCodeModel();
24537 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24538 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24542 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24544 // If a reference to this global requires an extra load, we can't fold it.
24545 if (isGlobalStubReference(GVFlags))
24548 // If BaseGV requires a register for the PIC base, we cannot also have a
24549 // BaseReg specified.
24550 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24553 // If lower 4G is not available, then we must use rip-relative addressing.
24554 if ((M != CodeModel::Small || isPositionIndependent()) &&
24555 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24559 switch (AM.Scale) {
24565 // These scales always work.
24570 // These scales are formed with basereg+scalereg. Only accept if there is
// no basereg yet.
24575 default: // Other stuff never works.
24582 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24583 unsigned Bits = Ty->getScalarSizeInBits();
24585 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24586 // particularly cheaper than those without.
24590 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24591 // variable shifts just as cheap as scalar ones.
24592 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24595 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24596 // fully general vector.
24600 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24601 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24603 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24604 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24605 return NumBits1 > NumBits2;
24608 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24609 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24612 if (!isTypeLegal(EVT::getEVT(Ty1)))
24615 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24617 // Assuming the caller doesn't have a zeroext or signext return parameter,
24618 // truncation all the way down to i1 is valid.
24622 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24623 return isInt<32>(Imm);
24626 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24627 // Can also use sub to handle negated immediates.
24628 return isInt<32>(Imm);
24631 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24632 if (!VT1.isInteger() || !VT2.isInteger())
24634 unsigned NumBits1 = VT1.getSizeInBits();
24635 unsigned NumBits2 = VT2.getSizeInBits();
24636 return NumBits1 > NumBits2;
24639 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24640 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24641 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24644 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24645 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24646 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24649 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24650 EVT VT1 = Val.getValueType();
24651 if (isZExtFree(VT1, VT2))
24654 if (Val.getOpcode() != ISD::LOAD)
24657 if (!VT1.isSimple() || !VT1.isInteger() ||
24658 !VT2.isSimple() || !VT2.isInteger())
24661 switch (VT1.getSimpleVT().SimpleTy) {
24666 // X86 has 8, 16, and 32-bit zero-extending loads.
24673 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24676 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24677 if (!Subtarget.hasAnyFMA())
24680 VT = VT.getScalarType();
24682 if (!VT.isSimple())
24685 switch (VT.getSimpleVT().SimpleTy) {
24696 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24697 // i16 instructions are longer (0x66 prefix) and potentially slower.
24698 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24701 /// Targets can use this to indicate that they only support *some*
24702 /// VECTOR_SHUFFLE operations, those with specific masks.
24703 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24704 /// are assumed to be legal.
24706 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24708 if (!VT.isSimple())
24711 // Not for i1 vectors
24712 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24715 // Very little shuffling can be done for 64-bit vectors right now.
24716 if (VT.getSimpleVT().getSizeInBits() == 64)
24719 // We only care that the types being shuffled are legal. The lowering can
24720 // handle any possible shuffle mask that results.
24721 return isTypeLegal(VT.getSimpleVT());
24725 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24727 // Just delegate to the generic legality, clear masks aren't special.
24728 return isShuffleMaskLegal(Mask, VT);
24731 //===----------------------------------------------------------------------===//
24732 // X86 Scheduler Hooks
24733 //===----------------------------------------------------------------------===//
24735 /// Utility function to emit xbegin specifying the start of an RTM region.
24736 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24737 const TargetInstrInfo *TII) {
24738 DebugLoc DL = MI.getDebugLoc();
24740 const BasicBlock *BB = MBB->getBasicBlock();
24741 MachineFunction::iterator I = ++MBB->getIterator();
24743 // For the v = xbegin(), we generate:
// thisMBB:
//  xbegin sinkMBB
// mainMBB:
//  eax = -1
// sinkMBB:
//  v = eax
24754 MachineBasicBlock *thisMBB = MBB;
24755 MachineFunction *MF = MBB->getParent();
24756 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24757 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24758 MF->insert(I, mainMBB);
24759 MF->insert(I, sinkMBB);
24761 // Transfer the remainder of BB and its successor edges to sinkMBB.
24762 sinkMBB->splice(sinkMBB->begin(), MBB,
24763 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24764 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
//  xbegin sinkMBB
24768 //  # fallthrough to mainMBB
24769 //  # abort to sinkMBB
24770 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
24771 thisMBB->addSuccessor(mainMBB);
24772 thisMBB->addSuccessor(sinkMBB);
24776 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
24777 mainMBB->addSuccessor(sinkMBB);
24780 // EAX is live into the sinkMBB
24781 sinkMBB->addLiveIn(X86::EAX);
24782 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
24783 MI.getOperand(0).getReg())
24786 MI.eraseFromParent();
24790 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24791 // or XMM0_V32I8 in AVX, all of this code can be replaced with that
24792 // in the .td file.
24793 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24794 const TargetInstrInfo *TII) {
24796 switch (MI.getOpcode()) {
24797 default: llvm_unreachable("illegal opcode!");
24798 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24799 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24800 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24801 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24802 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24803 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24804 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24805 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24808 DebugLoc dl = MI.getDebugLoc();
24809 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24811 unsigned NumArgs = MI.getNumOperands();
24812 for (unsigned i = 1; i < NumArgs; ++i) {
24813 MachineOperand &Op = MI.getOperand(i);
24814 if (!(Op.isReg() && Op.isImplicit()))
        MIB.add(Op);
24817 if (MI.hasOneMemOperand())
24818 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24820 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24821 .addReg(X86::XMM0);
24823 MI.eraseFromParent();
24827 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24828 // defs in an instruction pattern
24829 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24830 const TargetInstrInfo *TII) {
24832 switch (MI.getOpcode()) {
24833 default: llvm_unreachable("illegal opcode!");
24834 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24835 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24836 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24837 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24838 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24839 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24840 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24841 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24844 DebugLoc dl = MI.getDebugLoc();
24845 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24847 unsigned NumArgs = MI.getNumOperands(); // remove the results
24848 for (unsigned i = 1; i < NumArgs; ++i) {
24849 MachineOperand &Op = MI.getOperand(i);
24850 if (!(Op.isReg() && Op.isImplicit()))
        MIB.add(Op);
24853 if (MI.hasOneMemOperand())
24854 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24856 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
          .addReg(X86::ECX);
24859 MI.eraseFromParent();
24863 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24864 const X86Subtarget &Subtarget) {
24865 DebugLoc dl = MI.getDebugLoc();
24866 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24868 // insert input VAL into EAX
24869 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24870 .addReg(MI.getOperand(0).getReg());
24871 // insert zero to ECX
24872 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24874 // insert zero to EDX
24875 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24877 // insert WRPKRU instruction
24878 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24880 MI.eraseFromParent(); // The pseudo is gone now.
24884 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24885 const X86Subtarget &Subtarget) {
24886 DebugLoc dl = MI.getDebugLoc();
24887 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24889 // insert zero to ECX
24890 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24892 // insert RDPKRU instruction
24893 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24894 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
          .addReg(X86::EAX);
24897 MI.eraseFromParent(); // The pseudo is gone now.
24901 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24902 const X86Subtarget &Subtarget,
24904 DebugLoc dl = MI.getDebugLoc();
24905 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24906 // Address into RAX/EAX, other two args into ECX, EDX.
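// Rough shape of what is emitted below (sketch): LEA the address operand
// into RAX/EAX, COPY the two remaining scalar operands into ECX and EDX,
// then issue the opcode, which reads those registers implicitly.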
24907 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24908 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24909 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24910 for (int i = 0; i < X86::AddrNumOperands; ++i)
24911 MIB.add(MI.getOperand(i));
24913 unsigned ValOps = X86::AddrNumOperands;
24914 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24915 .addReg(MI.getOperand(ValOps).getReg());
24916 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24917 .addReg(MI.getOperand(ValOps + 1).getReg());
24919 // The instruction doesn't actually take any operands though.
24920 BuildMI(*BB, MI, dl, TII->get(Opc));
24922 MI.eraseFromParent(); // The pseudo is gone now.
24926 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24927 const X86Subtarget &Subtarget) {
24928 DebugLoc dl = MI->getDebugLoc();
24929 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24930 // Address into RAX/EAX
24931 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24932 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24933 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24934 for (int i = 0; i < X86::AddrNumOperands; ++i)
24935 MIB.add(MI->getOperand(i));
24937 // The instruction doesn't actually take any operands though.
24938 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24940 MI->eraseFromParent(); // The pseudo is gone now.
24946 MachineBasicBlock *
24947 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24948 MachineBasicBlock *MBB) const {
24949 // Emit va_arg instruction on X86-64.
24951 // Operands to this pseudo-instruction:
24952 // 0 ) Output : destination address (reg)
24953 // 1-5) Input : va_list address (addr, i64mem)
24954 // 6 ) ArgSize : Size (in bytes) of vararg type
24955 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24956 // 8 ) Align : Alignment of type
24957 // 9 ) EFLAGS (implicit-def)
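// Operands 1-5 form a standard X86 5-part memory reference
// (base, scale, index, displacement, segment); they are unpacked below.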
24959 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24960 static_assert(X86::AddrNumOperands == 5,
24961 "VAARG_64 assumes 5 address operands");
24963 unsigned DestReg = MI.getOperand(0).getReg();
24964 MachineOperand &Base = MI.getOperand(1);
24965 MachineOperand &Scale = MI.getOperand(2);
24966 MachineOperand &Index = MI.getOperand(3);
24967 MachineOperand &Disp = MI.getOperand(4);
24968 MachineOperand &Segment = MI.getOperand(5);
24969 unsigned ArgSize = MI.getOperand(6).getImm();
24970 unsigned ArgMode = MI.getOperand(7).getImm();
24971 unsigned Align = MI.getOperand(8).getImm();
24973 // Memory Reference
24974 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24975 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24976 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24978 // Machine Information
24979 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24980 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24981 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24982 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24983 DebugLoc DL = MI.getDebugLoc();
24985 // struct va_list {
//   i32   gp_offset
//   i32   fp_offset
24988 //   i64 overflow_area (address)
24989 //   i64 reg_save_area (address)
// }
24991 // sizeof(va_list) = 24
24992 // alignment(va_list) = 8
24994 unsigned TotalNumIntRegs = 6;
24995 unsigned TotalNumXMMRegs = 8;
24996 bool UseGPOffset = (ArgMode == 1);
24997 bool UseFPOffset = (ArgMode == 2);
24998 unsigned MaxOffset = TotalNumIntRegs * 8 +
24999 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
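// e.g. MaxOffset = 6*8 = 48 when using gp_offset, or 6*8 + 8*16 = 176 when
// using fp_offset (GP register slots are 8 bytes, XMM slots 16 bytes).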
25001 // Align ArgSize to a multiple of 8.
25002 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
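// e.g. ArgSize = 12 rounds up to ArgSizeA8 = 16; ArgSize = 8 stays 8.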
25003 bool NeedsAlign = (Align > 8);
25005 MachineBasicBlock *thisMBB = MBB;
25006 MachineBasicBlock *overflowMBB;
25007 MachineBasicBlock *offsetMBB;
25008 MachineBasicBlock *endMBB;
25010 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25011 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25012 unsigned OffsetReg = 0;
25014 if (!UseGPOffset && !UseFPOffset) {
25015 // If we only pull from the overflow region, we don't create a branch.
25016 // We don't need to alter control flow.
25017 OffsetDestReg = 0; // unused
25018 OverflowDestReg = DestReg;
25020 offsetMBB = nullptr;
25021 overflowMBB = thisMBB;
25024 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25025 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25026 // If not, pull from overflow_area. (branch to overflowMBB)
//            thisMBB
//            /     \
//      offsetMBB   overflowMBB
//            \     /
//             endMBB
25036 // Registers for the PHI in endMBB
25037 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25038 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25040 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25041 MachineFunction *MF = MBB->getParent();
25042 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25043 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25044 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25046 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25048 // Insert the new basic blocks
25049 MF->insert(MBBIter, offsetMBB);
25050 MF->insert(MBBIter, overflowMBB);
25051 MF->insert(MBBIter, endMBB);
25053 // Transfer the remainder of MBB and its successor edges to endMBB.
25054 endMBB->splice(endMBB->begin(), thisMBB,
25055 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25056 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25058 // Make offsetMBB and overflowMBB successors of thisMBB
25059 thisMBB->addSuccessor(offsetMBB);
25060 thisMBB->addSuccessor(overflowMBB);
25062 // endMBB is a successor of both offsetMBB and overflowMBB
25063 offsetMBB->addSuccessor(endMBB);
25064 overflowMBB->addSuccessor(endMBB);
25066 // Load the offset value into a register
25067 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25068 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25072 .addDisp(Disp, UseFPOffset ? 4 : 0)
25074 .setMemRefs(MMOBegin, MMOEnd);
25076 // Check if there is enough room left to pull this argument.
25077 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25079 .addImm(MaxOffset + 8 - ArgSizeA8);
25081 // Branch to "overflowMBB" if offset >= max
25082 // Fall through to "offsetMBB" otherwise
25083 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25084 .addMBB(overflowMBB);
25087 // In offsetMBB, emit code to use the reg_save_area.
25089 assert(OffsetReg != 0);
25091 // Read the reg_save_area address.
25092 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25093 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25099 .setMemRefs(MMOBegin, MMOEnd);
25101 // Zero-extend the offset
25102 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25103 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25106 .addImm(X86::sub_32bit);
25108 // Add the offset to the reg_save_area to get the final address.
25109 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25110 .addReg(OffsetReg64)
25111 .addReg(RegSaveReg);
25113 // Compute the offset for the next argument
25114 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25115 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25117 .addImm(UseFPOffset ? 16 : 8);
25119 // Store it back into the va_list.
25120 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25124 .addDisp(Disp, UseFPOffset ? 4 : 0)
25126 .addReg(NextOffsetReg)
25127 .setMemRefs(MMOBegin, MMOEnd);
25130 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25135 // Emit code to use overflow area
25138 // Load the overflow_area address into a register.
25139 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25140 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25146 .setMemRefs(MMOBegin, MMOEnd);
25148 // If we need to align it, do so. Otherwise, just copy the address
25149 // to OverflowDestReg.
25151 // Align the overflow address
25152 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25153 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25155 // aligned_addr = (addr + (align-1)) & ~(align-1)
25156 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25157 .addReg(OverflowAddrReg)
25160 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25162 .addImm(~(uint64_t)(Align-1));
25164 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25165 .addReg(OverflowAddrReg);
25168 // Compute the next overflow address after this argument.
25169 // (the overflow address should be kept 8-byte aligned)
25170 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25171 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25172 .addReg(OverflowDestReg)
25173 .addImm(ArgSizeA8);
25175 // Store the new overflow address.
25176 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25182 .addReg(NextAddrReg)
25183 .setMemRefs(MMOBegin, MMOEnd);
25185 // If we branched, emit the PHI to the front of endMBB.
25187 BuildMI(*endMBB, endMBB->begin(), DL,
25188 TII->get(X86::PHI), DestReg)
25189 .addReg(OffsetDestReg).addMBB(offsetMBB)
25190 .addReg(OverflowDestReg).addMBB(overflowMBB);
25193 // Erase the pseudo instruction
25194 MI.eraseFromParent();
25199 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25200 MachineInstr &MI, MachineBasicBlock *MBB) const {
25201 // Emit code to save XMM registers to the stack. The ABI says that the
25202 // number of registers to save is given in %al, so it's theoretically
25203 // possible to do an indirect jump trick to avoid saving all of them;
25204 // however, this code takes a simpler approach and just executes all
25205 // of the stores if %al is non-zero. It's less code, and it's probably
25206 // easier on the hardware branch predictor, and stores aren't all that
25207 // expensive anyway.
25209 // Create the new basic blocks. One block contains all the XMM stores,
25210 // and one block is the final destination regardless of whether any
25211 // stores were performed.
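// Emitted control flow (sketch):
//   MBB:        test %al, %al ; je EndMBB     (skipped for Win64 CCs)
//   XMMSaveMBB: one (v)movaps per XMM argument register into the save area
//   EndMBB:     the rest of the original block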
25212 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25213 MachineFunction *F = MBB->getParent();
25214 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25215 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25216 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25217 F->insert(MBBIter, XMMSaveMBB);
25218 F->insert(MBBIter, EndMBB);
25220 // Transfer the remainder of MBB and its successor edges to EndMBB.
25221 EndMBB->splice(EndMBB->begin(), MBB,
25222 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25223 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25225 // The original block will now fall through to the XMM save block.
25226 MBB->addSuccessor(XMMSaveMBB);
25227 // The XMMSaveMBB will fall through to the end block.
25228 XMMSaveMBB->addSuccessor(EndMBB);
25230 // Now add the instructions.
25231 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25232 DebugLoc DL = MI.getDebugLoc();
25234 unsigned CountReg = MI.getOperand(0).getReg();
25235 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25236 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25238 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25239 // If %al is 0, branch around the XMM save block.
25240 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25241 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25242 MBB->addSuccessor(EndMBB);
25245 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25246 // that was just emitted, but clearly shouldn't be "saved".
25247 assert((MI.getNumOperands() <= 3 ||
25248 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25249 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25250 "Expected last argument to be EFLAGS");
25251 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25252 // In the XMM save block, save all the XMM argument registers.
25253 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25254 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25255 MachineMemOperand *MMO = F->getMachineMemOperand(
25256 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25257 MachineMemOperand::MOStore,
25258 /*Size=*/16, /*Align=*/16);
25259 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25260 .addFrameIndex(RegSaveFrameIndex)
25261 .addImm(/*Scale=*/1)
25262 .addReg(/*IndexReg=*/0)
25263 .addImm(/*Disp=*/Offset)
25264 .addReg(/*Segment=*/0)
25265 .addReg(MI.getOperand(i).getReg())
25266 .addMemOperand(MMO);
25269 MI.eraseFromParent(); // The pseudo instruction is gone now.
25274 // The EFLAGS operand of SelectItr might be missing a kill marker
25275 // because there were multiple uses of EFLAGS, and ISel didn't know
25276 // which to mark. Figure out whether SelectItr should have had a
25277 // kill marker, and set it if it should. Returns the correct kill marker value.
25279 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25280 MachineBasicBlock* BB,
25281 const TargetRegisterInfo* TRI) {
25282 // Scan forward through BB for a use/def of EFLAGS.
25283 MachineBasicBlock::iterator miI(std::next(SelectItr));
25284 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25285 const MachineInstr& mi = *miI;
25286 if (mi.readsRegister(X86::EFLAGS))
25288 if (mi.definesRegister(X86::EFLAGS))
25289 break; // Should have kill-flag - update below.
25292 // If we hit the end of the block, check whether EFLAGS is live into a
25294 if (miI == BB->end()) {
25295 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25296 sEnd = BB->succ_end();
25297 sItr != sEnd; ++sItr) {
25298 MachineBasicBlock* succ = *sItr;
25299 if (succ->isLiveIn(X86::EFLAGS))
25304 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25305 // out. SelectMI should have a kill flag on EFLAGS.
25306 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25310 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25311 // together with other CMOV pseudo-opcodes into a single basic-block with
25312 // conditional jump around it.
25313 static bool isCMOVPseudo(MachineInstr &MI) {
25314 switch (MI.getOpcode()) {
25315 case X86::CMOV_FR32:
25316 case X86::CMOV_FR64:
25317 case X86::CMOV_GR8:
25318 case X86::CMOV_GR16:
25319 case X86::CMOV_GR32:
25320 case X86::CMOV_RFP32:
25321 case X86::CMOV_RFP64:
25322 case X86::CMOV_RFP80:
25323 case X86::CMOV_V2F64:
25324 case X86::CMOV_V2I64:
25325 case X86::CMOV_V4F32:
25326 case X86::CMOV_V4F64:
25327 case X86::CMOV_V4I64:
25328 case X86::CMOV_V16F32:
25329 case X86::CMOV_V8F32:
25330 case X86::CMOV_V8F64:
25331 case X86::CMOV_V8I64:
25332 case X86::CMOV_V8I1:
25333 case X86::CMOV_V16I1:
25334 case X86::CMOV_V32I1:
25335 case X86::CMOV_V64I1:
25343 MachineBasicBlock *
25344 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25345 MachineBasicBlock *BB) const {
25346 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25347 DebugLoc DL = MI.getDebugLoc();
25349 // To "insert" a SELECT_CC instruction, we actually have to insert the
25350 // diamond control-flow pattern. The incoming instruction knows the
25351 // destination vreg to set, the condition code register to branch on, the
25352 // true/false values to select between, and a branch opcode to use.
25353 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25354 MachineFunction::iterator It = ++BB->getIterator();
25359 // cmpTY ccX, r1, r2
25361 // fallthrough --> copy0MBB
25362 MachineBasicBlock *thisMBB = BB;
25363 MachineFunction *F = BB->getParent();
25365 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25366 // as described above, by inserting a BB, and then making a PHI at the join
25367 // point to select the true and false operands of the CMOV in the PHI.
25369 // The code also handles two different cases of multiple CMOV opcodes:
//
// Case 1:
25373 // In this case, there are multiple CMOVs in a row, all which are based on
25374 // the same condition setting (or the exact opposite condition setting).
25375 // In this case we can lower all the CMOVs using a single inserted BB, and
25376 // then make a number of PHIs at the join point to model the CMOVs. The only
25377 // trickiness here, is that in a case like:
25379 // t2 = CMOV cond1 t1, f1
25380 // t3 = CMOV cond1 t2, f2
25382 // when rewriting this into PHIs, we have to perform some renaming on the
25383 // temps since you cannot have a PHI operand refer to a PHI result earlier
25384 // in the same block. The "simple" but wrong lowering would be:
25386 // t2 = PHI t1(BB1), f1(BB2)
25387 // t3 = PHI t2(BB1), f2(BB2)
25389 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25390 // renaming is to note that on the path through BB1, t2 is really just a
25391 // copy of t1, and do that renaming, properly generating:
25393 // t2 = PHI t1(BB1), f1(BB2)
25394 // t3 = PHI t1(BB1), f2(BB2)
25396 // Case 2, we lower cascaded CMOVs such as
25398 // (CMOV (CMOV F, T, cc1), T, cc2)
25400 // to two successive branches. For that, we look for another CMOV as the
25401 // following instruction.
25403 // Without this, we would add a PHI between the two jumps, which ends up
25404 // creating a few copies all around. For instance, for
25406 // (sitofp (zext (fcmp une)))
25408 // we would generate:
25410 // ucomiss %xmm1, %xmm0
25411 // movss <1.0f>, %xmm0
25412 // movaps %xmm0, %xmm1
// jne .LBB5_2
25414 // xorps %xmm1, %xmm1
// .LBB5_2:
// jp .LBB5_4
25417 // movaps %xmm1, %xmm0
// .LBB5_4:
// retq
25421 // because this custom-inserter would have generated:
25433 // A: X = ...; Y = ...
// B: empty
25435 // C: Z = PHI [X, A], [Y, B]
// D: empty
25437 // E: PHI [X, C], [Z, D]
25439 // If we lower both CMOVs in a single step, we can instead generate:
25451 // A: X = ...; Y = ...
// D: empty
25453 // E: PHI [X, A], [X, C], [Y, D]
25455 // Which, in our sitofp/fcmp example, gives us something like:
25457 // ucomiss %xmm1, %xmm0
25458 // movss <1.0f>, %xmm0
// jne .LBB5_4
// jp .LBB5_4
25461 // xorps %xmm0, %xmm0
// .LBB5_4:
// retq
25465 MachineInstr *CascadedCMOV = nullptr;
25466 MachineInstr *LastCMOV = &MI;
25467 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25468 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25469 MachineBasicBlock::iterator NextMIIt =
25470 std::next(MachineBasicBlock::iterator(MI));
25472 // Check for case 1, where there are multiple CMOVs with the same condition
25473 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25474 // number of jumps the most.
25476 if (isCMOVPseudo(MI)) {
25477 // See if we have a string of CMOVS with the same condition.
25478 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25479 (NextMIIt->getOperand(3).getImm() == CC ||
25480 NextMIIt->getOperand(3).getImm() == OppCC)) {
25481 LastCMOV = &*NextMIIt;
25486 // This checks for case 2, but only do this if we didn't already find
25487 // case 1, as indicated by LastCMOV == MI.
25488 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25489 NextMIIt->getOpcode() == MI.getOpcode() &&
25490 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25491 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25492 NextMIIt->getOperand(1).isKill()) {
25493 CascadedCMOV = &*NextMIIt;
25496 MachineBasicBlock *jcc1MBB = nullptr;
25498 // If we have a cascaded CMOV, we lower it to two successive branches to
25499 // the same block. EFLAGS is used by both, so mark it as live in the second.
25500 if (CascadedCMOV) {
25501 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25502 F->insert(It, jcc1MBB);
25503 jcc1MBB->addLiveIn(X86::EFLAGS);
25506 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25507 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25508 F->insert(It, copy0MBB);
25509 F->insert(It, sinkMBB);
25511 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25512 // live into the sink and copy blocks.
25513 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25515 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25516 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25517 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25518 copy0MBB->addLiveIn(X86::EFLAGS);
25519 sinkMBB->addLiveIn(X86::EFLAGS);
25522 // Transfer the remainder of BB and its successor edges to sinkMBB.
25523 sinkMBB->splice(sinkMBB->begin(), BB,
25524 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25525 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25527 // Add the true and fallthrough blocks as its successors.
25528 if (CascadedCMOV) {
25529 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25530 BB->addSuccessor(jcc1MBB);
25532 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25533 // jump to the sinkMBB.
25534 jcc1MBB->addSuccessor(copy0MBB);
25535 jcc1MBB->addSuccessor(sinkMBB);
25537 BB->addSuccessor(copy0MBB);
25540 // The true block target of the first (or only) branch is always sinkMBB.
25541 BB->addSuccessor(sinkMBB);
25543 // Create the conditional branch instruction.
25544 unsigned Opc = X86::GetCondBranchFromCond(CC);
25545 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25547 if (CascadedCMOV) {
25548 unsigned Opc2 = X86::GetCondBranchFromCond(
25549 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25550 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
// copy0MBB:
25554 // %FalseValue = ...
25555 // # fallthrough to sinkMBB
25556 copy0MBB->addSuccessor(sinkMBB);
// sinkMBB:
25559 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25561 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25562 MachineBasicBlock::iterator MIItEnd =
25563 std::next(MachineBasicBlock::iterator(LastCMOV));
25564 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25565 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25566 MachineInstrBuilder MIB;
25568 // As we are creating the PHIs, we have to be careful if there is more than
25569 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25570 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25571 // That also means that PHI construction must work forward from earlier to
25572 // later, and that the code must maintain a mapping from earlier PHI's
25573 // destination registers, and the registers that went into the PHI.
25575 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25576 unsigned DestReg = MIIt->getOperand(0).getReg();
25577 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25578 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25580 // If this CMOV we are generating is the opposite condition from
25581 // the jump we generated, then we have to swap the operands for the
25582 // PHI that is going to be generated.
25583 if (MIIt->getOperand(3).getImm() == OppCC)
25584 std::swap(Op1Reg, Op2Reg);
25586 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25587 Op1Reg = RegRewriteTable[Op1Reg].first;
25589 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25590 Op2Reg = RegRewriteTable[Op2Reg].second;
25592 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25593 TII->get(X86::PHI), DestReg)
25594 .addReg(Op1Reg).addMBB(copy0MBB)
25595 .addReg(Op2Reg).addMBB(thisMBB);
25597 // Add this PHI to the rewrite table.
25598 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25601 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25602 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25603 if (CascadedCMOV) {
25604 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25605 // Copy the PHI result to the register defined by the second CMOV.
25606 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25607 DL, TII->get(TargetOpcode::COPY),
25608 CascadedCMOV->getOperand(0).getReg())
25609 .addReg(MI.getOperand(0).getReg());
25610 CascadedCMOV->eraseFromParent();
25613 // Now remove the CMOV(s).
25614 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25615 (MIIt++)->eraseFromParent();
25620 MachineBasicBlock *
25621 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25622 MachineBasicBlock *BB) const {
25623 // Combine the following atomic floating-point modification pattern:
25624 // a.store(reg OP a.load(acquire), release)
25625 // Transform them into:
25626 // OPss (%gpr), %xmm
25627 // movss %xmm, (%gpr)
25628 // Or sd equivalent for 64-bit operations.
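// Note (rationale, not part of the original comment): the pattern above is a
// separate atomic load and atomic store, not a single atomic RMW, and on x86
// every naturally aligned load already has acquire semantics and every
// aligned store release semantics, so folding the load into the SSE op and
// using a plain store preserves the required ordering.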
unsigned MOp, FOp;
25630 switch (MI.getOpcode()) {
25631 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25632 case X86::RELEASE_FADD32mr:
25633 FOp = X86::ADDSSrm;
25634 MOp = X86::MOVSSmr;
25636 case X86::RELEASE_FADD64mr:
25637 FOp = X86::ADDSDrm;
25638 MOp = X86::MOVSDmr;
25641 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25642 DebugLoc DL = MI.getDebugLoc();
25643 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25644 unsigned ValOpIdx = X86::AddrNumOperands;
25645 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25646 MachineInstrBuilder MIB =
25647 BuildMI(*BB, MI, DL, TII->get(FOp),
25648 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25650 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25651 MachineOperand &Operand = MI.getOperand(i);
25652 // Clear any kill flags on register operands as we'll create a second
25653 // instruction using the same address operands.
25654 if (Operand.isReg())
25655 Operand.setIsKill(false);
25658 MachineInstr *FOpMI = MIB;
25659 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25660 for (int i = 0; i < X86::AddrNumOperands; ++i)
25661 MIB.add(MI.getOperand(i));
25662 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25663 MI.eraseFromParent(); // The pseudo instruction is gone now.
25667 MachineBasicBlock *
25668 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25669 MachineBasicBlock *BB) const {
25670 MachineFunction *MF = BB->getParent();
25671 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25672 DebugLoc DL = MI.getDebugLoc();
25673 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25675 assert(MF->shouldSplitStack());
25677 const bool Is64Bit = Subtarget.is64Bit();
25678 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25680 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25681 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
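// TlsOffset is the offset of the split-stack limit slot in the thread's TLS
// block (addressed via %fs on 64-bit, %gs on 32-bit); the CMP emitted below
// checks the candidate stack pointer against that limit.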
// BB:
25684 // ... [Till the alloca]
25685 // If stacklet is not large enough, jump to mallocMBB
// bumpMBB:
25688 // Allocate by subtracting from RSP
25689 // Jump to continueMBB
// mallocMBB:
25692 // Allocate by call to runtime
// continueMBB:
// ...
25696 // [rest of original BB]
25699 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25700 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25701 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25703 MachineRegisterInfo &MRI = MF->getRegInfo();
25704 const TargetRegisterClass *AddrRegClass =
25705 getRegClassFor(getPointerTy(MF->getDataLayout()));
25707 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25708 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25709 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25710 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25711 sizeVReg = MI.getOperand(1).getReg(),
25713 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25715 MachineFunction::iterator MBBIter = ++BB->getIterator();
25717 MF->insert(MBBIter, bumpMBB);
25718 MF->insert(MBBIter, mallocMBB);
25719 MF->insert(MBBIter, continueMBB);
25721 continueMBB->splice(continueMBB->begin(), BB,
25722 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25723 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25725 // Add code to the main basic block to check if the stack limit has been hit,
25726 // and if so, jump to mallocMBB otherwise to bumpMBB.
25727 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25728 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25729 .addReg(tmpSPVReg).addReg(sizeVReg);
25730 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25731 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25732 .addReg(SPLimitVReg);
25733 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25735 // bumpMBB simply decreases the stack pointer, since we know the current
25736 // stacklet has enough space.
25737 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25738 .addReg(SPLimitVReg);
25739 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25740 .addReg(SPLimitVReg);
25741 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25743 // Calls into a routine in libgcc to allocate more space from the heap.
25744 const uint32_t *RegMask =
25745 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25747 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25749 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25750 .addExternalSymbol("__morestack_allocate_stack_space")
25751 .addRegMask(RegMask)
25752 .addReg(X86::RDI, RegState::Implicit)
25753 .addReg(X86::RAX, RegState::ImplicitDefine);
25754 } else if (Is64Bit) {
25755 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25757 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25758 .addExternalSymbol("__morestack_allocate_stack_space")
25759 .addRegMask(RegMask)
25760 .addReg(X86::EDI, RegState::Implicit)
25761 .addReg(X86::EAX, RegState::ImplicitDefine);
25763 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25765 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25766 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25767 .addExternalSymbol("__morestack_allocate_stack_space")
25768 .addRegMask(RegMask)
25769 .addReg(X86::EAX, RegState::ImplicitDefine);
25773 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25776 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25777 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25778 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25780 // Set up the CFG correctly.
25781 BB->addSuccessor(bumpMBB);
25782 BB->addSuccessor(mallocMBB);
25783 mallocMBB->addSuccessor(continueMBB);
25784 bumpMBB->addSuccessor(continueMBB);
25786 // Take care of the PHI nodes.
25787 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25788 MI.getOperand(0).getReg())
25789 .addReg(mallocPtrVReg)
25791 .addReg(bumpSPPtrVReg)
25794 // Delete the original pseudo instruction.
25795 MI.eraseFromParent();
25798 return continueMBB;
25801 MachineBasicBlock *
25802 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25803 MachineBasicBlock *BB) const {
25804 MachineFunction *MF = BB->getParent();
25805 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25806 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25807 DebugLoc DL = MI.getDebugLoc();
25809 assert(!isAsynchronousEHPersonality(
25810 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25811 "SEH does not use catchret!");
25813 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25814 if (!Subtarget.is32Bit())
25817 // C++ EH creates a new target block to hold the restore code, and wires up
25818 // the new block to the return destination with a normal JMP_4.
25819 MachineBasicBlock *RestoreMBB =
25820 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25821 assert(BB->succ_size() == 1);
25822 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25823 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25824 BB->addSuccessor(RestoreMBB);
25825 MI.getOperand(0).setMBB(RestoreMBB);
25827 auto RestoreMBBI = RestoreMBB->begin();
25828 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
25833 MachineBasicBlock *
25834 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25835 MachineBasicBlock *BB) const {
25836 MachineFunction *MF = BB->getParent();
25837 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25838 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25839 // Only 32-bit SEH requires special handling for catchpad.
25840 if (IsSEH && Subtarget.is32Bit()) {
25841 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25842 DebugLoc DL = MI.getDebugLoc();
25843 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
}
MI.eraseFromParent();
return BB;
}
25849 MachineBasicBlock *
25850 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25851 MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
// adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into a call inside MC, and without
// the two stack-adjustment markers shrink-wrapping could move the
// prologue/epilogue past the call.
25857 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25858 DebugLoc DL = MI.getDebugLoc();
25859 MachineFunction &MF = *BB->getParent();
25861 // Emit CALLSEQ_START right before the instruction.
25862 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25863 MachineInstrBuilder CallseqStart =
25864 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25865 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25867 // Emit CALLSEQ_END right after the instruction.
25868 // We don't call erase from parent because we want to keep the
25869 // original instruction around.
25870 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25871 MachineInstrBuilder CallseqEnd =
25872 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);

return BB;
}
25878 MachineBasicBlock *
25879 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25880 MachineBasicBlock *BB) const {
25881 // This is pretty easy. We're taking the value that we received from
25882 // our load from the relocation, sticking it in either RDI (x86-64)
25883 // or EAX and doing an indirect call. The return value will then
25884 // be in the normal return register.
25885 MachineFunction *F = BB->getParent();
25886 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25887 DebugLoc DL = MI.getDebugLoc();
25889 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25890 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25892 // Get a register mask for the lowered call.
25893 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25894 // proper register mask.
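// On 64-bit Darwin the TLS helper call preserves far more registers than an
// ordinary C call, so a dedicated preserved-register mask is used; the 32-bit
// path falls back to the plain C calling-convention mask (see the FIXME above).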
25895 const uint32_t *RegMask =
25896 Subtarget.is64Bit() ?
25897 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25898 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25899 if (Subtarget.is64Bit()) {
25900 MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
    .addReg(X86::RIP)
    .addImm(1)
    .addReg(0)
    .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                      MI.getOperand(3).getTargetFlags())
    .addReg(0);
25908 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25909 addDirectMem(MIB, X86::RDI);
25910 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25911 } else if (!isPositionIndependent()) {
25912 MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
    .addReg(0)
    .addImm(1)
    .addReg(0)
    .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                      MI.getOperand(3).getTargetFlags())
    .addReg(0);
25920 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25921 addDirectMem(MIB, X86::EAX);
25922 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
} else {
  MachineInstrBuilder MIB =
      BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
          .addReg(TII->getGlobalBaseReg(F))
          .addImm(1)
          .addReg(0)
          .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                            MI.getOperand(3).getTargetFlags())
          .addReg(0);
25932 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25933 addDirectMem(MIB, X86::EAX);
25934 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
}

MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
25941 MachineBasicBlock *
25942 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25943 MachineBasicBlock *MBB) const {
25944 DebugLoc DL = MI.getDebugLoc();
25945 MachineFunction *MF = MBB->getParent();
25946 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25947 MachineRegisterInfo &MRI = MF->getRegInfo();
25949 const BasicBlock *BB = MBB->getBasicBlock();
25950 MachineFunction::iterator I = ++MBB->getIterator();
25952 // Memory Reference
25953 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25954 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
unsigned DstReg;
unsigned MemOpndSlot = 0;
25959 unsigned CurOp = 0;
25961 DstReg = MI.getOperand(CurOp++).getReg();
25962 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25963 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25964 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25965 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25967 MemOpndSlot = CurOp;
25969 MVT PVT = getPointerTy(MF->getDataLayout());
25970 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25971 "Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
//  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
//  SjLjSetup restoreMBB
//
// mainMBB:
//  v_main = 0
//
// sinkMBB:
//  v = phi(main, restore)
//
// restoreMBB:
//  if the base pointer is in use, reload it from the frame
//  v_restore = 1
25989 MachineBasicBlock *thisMBB = MBB;
25990 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25991 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25992 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25993 MF->insert(I, mainMBB);
25994 MF->insert(I, sinkMBB);
25995 MF->push_back(restoreMBB);
25996 restoreMBB->setHasAddressTaken();
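// restoreMBB's address is stored into the setjmp buffer, so it must be marked
// address-taken; otherwise later passes could merge or delete it.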
25998 MachineInstrBuilder MIB;
26000 // Transfer the remainder of BB and its successor edges to sinkMBB.
26001 sinkMBB->splice(sinkMBB->begin(), MBB,
26002 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26003 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26006 unsigned PtrStoreOpc = 0;
26007 unsigned LabelReg = 0;
26008 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26009 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26010 !isPositionIndependent();
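// With the small code model and no PIC, the address of restoreMBB fits in a
// 32-bit immediate and can be stored directly; otherwise it has to be
// materialized into a register with LEA first.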
26012 // Prepare IP either in reg or imm.
26013 if (!UseImmLabel) {
26014 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26015 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26016 LabelReg = MRI.createVirtualRegister(PtrRC);
26017 if (Subtarget.is64Bit()) {
26018 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26022 .addMBB(restoreMBB)
26025 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26026 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26027 .addReg(XII->getGlobalBaseReg(MF))
26030 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26034 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26036 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26037 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26038 if (i == X86::AddrDisp)
26039 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26041 MIB.add(MI.getOperand(MemOpndSlot + i));
26044 MIB.addReg(LabelReg);
26046 MIB.addMBB(restoreMBB);
26047 MIB.setMemRefs(MMOBegin, MMOEnd);
26049 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26050 .addMBB(restoreMBB);
26052 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26053 MIB.addRegMask(RegInfo->getNoPreservedMask());
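// A longjmp back into this function can arrive with any register clobbered,
// so the setup instruction is modeled as preserving nothing.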
26054 thisMBB->addSuccessor(mainMBB);
26055 thisMBB->addSuccessor(restoreMBB);
26059 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26060 mainMBB->addSuccessor(sinkMBB);
26063 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26064 TII->get(X86::PHI), DstReg)
26065 .addReg(mainDstReg).addMBB(mainMBB)
26066 .addReg(restoreDstReg).addMBB(restoreMBB);
26069 if (RegInfo->hasBasePointer(*MF)) {
26070 const bool Uses64BitFramePtr =
26071 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26072 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26073 X86FI->setRestoreBasePointer(MF);
26074 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26075 unsigned BasePtr = RegInfo->getBaseRegister();
26076 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26077 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26078 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26079 .setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26082 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26083 restoreMBB->addSuccessor(sinkMBB);
MI.eraseFromParent();
return sinkMBB;
}
26089 MachineBasicBlock *
26090 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26091 MachineBasicBlock *MBB) const {
26092 DebugLoc DL = MI.getDebugLoc();
26093 MachineFunction *MF = MBB->getParent();
26094 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26095 MachineRegisterInfo &MRI = MF->getRegInfo();
26097 // Memory Reference
26098 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26099 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26101 MVT PVT = getPointerTy(MF->getDataLayout());
26102 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26103 "Invalid Pointer Size!");
26105 const TargetRegisterClass *RC =
26106 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26107 unsigned Tmp = MRI.createVirtualRegister(RC);
26108 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26109 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26110 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26111 unsigned SP = RegInfo->getStackRegister();
26113 MachineInstrBuilder MIB;
26115 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26116 const int64_t SPOffset = 2 * PVT.getStoreSize();
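// Buffer layout, matching what emitEHSjLjSetJmp stores: slot 0 holds the
// frame pointer, slot 1 the resume address (label), and slot 2 the stack
// pointer.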
26118 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26119 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
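// Reload the frame pointer from the buffer.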
26122 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26123 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26124 MIB.add(MI.getOperand(i));
26125 MIB.setMemRefs(MMOBegin, MMOEnd);
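// Reload the resume address (the label stored by setjmp) into Tmp.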
26127 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26128 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26129 if (i == X86::AddrDisp)
26130 MIB.addDisp(MI.getOperand(i), LabelOffset);
26132 MIB.add(MI.getOperand(i));
26134 MIB.setMemRefs(MMOBegin, MMOEnd);
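// Reload the stack pointer, then jump to the resume address.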
26136 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26137 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26138 if (i == X86::AddrDisp)
26139 MIB.addDisp(MI.getOperand(i), SPOffset);
26141 MIB.add(MI.getOperand(i));
26143 MIB.setMemRefs(MMOBegin, MMOEnd);
26145 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
MI.eraseFromParent();
return MBB;
}
26151 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26152 MachineBasicBlock *MBB,
MachineBasicBlock *DispatchBB,
int FI) const {
26155 DebugLoc DL = MI.getDebugLoc();
26156 MachineFunction *MF = MBB->getParent();
26157 MachineRegisterInfo *MRI = &MF->getRegInfo();
26158 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26160 MVT PVT = getPointerTy(MF->getDataLayout());
26161 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
unsigned Op = 0;
unsigned VR = 0;
bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                   !isPositionIndependent();
if (UseImmLabel)
  Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
else {
26172 const TargetRegisterClass *TRC =
26173 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26174 VR = MRI->createVirtualRegister(TRC);
26175 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26177 if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
    .addReg(X86::RIP)
    .addImm(1)
    .addReg(0)
    .addMBB(DispatchBB)
    .addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
    .addReg(0) /* TII->getGlobalBaseReg(MF) */
    .addImm(1)
    .addReg(0)
    .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
    .addReg(0);
}

MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26194 addFrameReference(MIB, FI, 36);
if (UseImmLabel)
  MIB.addMBB(DispatchBB);
else
  MIB.addReg(VR);
}
26201 MachineBasicBlock *
26202 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26203 MachineBasicBlock *BB) const {
26204 DebugLoc DL = MI.getDebugLoc();
26205 MachineFunction *MF = BB->getParent();
26206 MachineFrameInfo &MFI = MF->getFrameInfo();
26207 MachineRegisterInfo *MRI = &MF->getRegInfo();
26208 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26209 int FI = MFI.getFunctionContextIndex();
26211 // Get a mapping of the call site numbers to all of the landing pads they're
26212 // associated with.
26213 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26214 unsigned MaxCSNum = 0;
26215 for (auto &MBB : *MF) {
if (!MBB.isEHPad())
  continue;
26219 MCSymbol *Sym = nullptr;
26220 for (const auto &MI : MBB) {
if (MI.isDebugValue())
  continue;
26224 assert(MI.isEHLabel() && "expected EH_LABEL");
Sym = MI.getOperand(0).getMCSymbol();
break;
}
if (!MF->hasCallSiteLandingPad(Sym))
  continue;
26232 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26233 CallSiteNumToLPad[CSI].push_back(&MBB);
MaxCSNum = std::max(MaxCSNum, CSI);
}
}
26238 // Get an ordered list of the machine basic blocks for the jump table.
26239 std::vector<MachineBasicBlock *> LPadList;
26240 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26241 LPadList.reserve(CallSiteNumToLPad.size());
26243 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26244 for (auto &LP : CallSiteNumToLPad[CSI]) {
26245 LPadList.push_back(LP);
InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
}
}
26250 assert(!LPadList.empty() &&
26251 "No landing pad destinations for the dispatch jump table!");
26253 // Create the MBBs for the dispatch code.
26255 // Shove the dispatch's address into the return slot in the function context.
26256 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26257 DispatchBB->setIsEHPad(true);
26259 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26260 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26261 DispatchBB->addSuccessor(TrapBB);
26263 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26264 DispatchBB->addSuccessor(DispContBB);
26267 MF->push_back(DispatchBB);
26268 MF->push_back(DispContBB);
26269 MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
26273 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26275 // Create the jump table and associated information
26276 MachineJumpTableInfo *JTI =
26277 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26278 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26280 const X86RegisterInfo &RI = TII->getRegisterInfo();
26281 // Add a register mask with no preserved registers. This results in all
26282 // registers being marked as clobbered.
26283 if (RI.hasBasePointer(*MF)) {
26284 const bool FPIs64Bit =
26285 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26286 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26287 MFI->setRestoreBasePointer(MF);
26289 unsigned FP = RI.getFrameRegister(*MF);
26290 unsigned BP = RI.getBaseRegister();
26291 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26292 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26293 MFI->getRestoreBasePointerOffset())
26294 .addRegMask(RI.getNoPreservedMask());
} else {
  BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
      .addRegMask(RI.getNoPreservedMask());
}
26300 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                  4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
    .addReg(IReg)
    .addImm(LPadList.size());
26306 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
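// Call-site indices recorded in the function context are 1-based, so bias the
// value by one before using it to index the jump table.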
26308 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
    .addReg(IReg)
    .addImm(1);
BuildMI(DispContBB, DL,
        TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
    .addReg(0)
    .addImm(Subtarget.is64Bit() ? 8 : 4)
    .addReg(JReg)
    .addJumpTableIndex(MJTI)
    .addReg(0);
26320 // Add the jump table entries as successors to the MBB.
26321 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26322 for (auto &LP : LPadList)
26323 if (SeenMBBs.insert(LP).second)
26324 DispContBB->addSuccessor(LP);
26326 // N.B. the order the invoke BBs are processed in doesn't matter here.
26327 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26328 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26329 for (MachineBasicBlock *MBB : InvokeBBs) {
26330 // Remove the landing pad successor from the invoke block and replace it
26331 // with the new dispatch block.
26332 // Keep a copy of Successors since it's modified inside the loop.
26333 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26335 // FIXME: Avoid quadratic complexity.
26336 for (auto MBBS : Successors) {
26337 if (MBBS->isEHPad()) {
26338 MBB->removeSuccessor(MBBS);
26339 MBBLPads.push_back(MBBS);
26343 MBB->addSuccessor(DispatchBB);
26345 // Find the invoke call and mark all of the callee-saved registers as
26346 // 'implicit defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
for (auto &II : reverse(*MBB)) {
  if (!II.isCall())
    continue;
26353 DenseMap<unsigned, bool> DefRegs;
for (auto &MOp : II.operands())
  if (MOp.isReg())
    DefRegs[MOp.getReg()] = true;
26358 MachineInstrBuilder MIB(*MF, &II);
26359 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
unsigned Reg = SavedRegs[RI];
if (!DefRegs[Reg])
  MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
}

break;
}
}
26369 // Mark all former landing pads as non-landing pads. The dispatch is the only
26370 // landing pad now.
26371 for (auto &LP : MBBLPads)
26372 LP->setIsEHPad(false);
26374 // The instruction is gone now.
MI.eraseFromParent();

return BB;
}
26379 MachineBasicBlock *
26380 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26381 MachineBasicBlock *BB) const {
26382 MachineFunction *MF = BB->getParent();
26383 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26384 DebugLoc DL = MI.getDebugLoc();
26386 switch (MI.getOpcode()) {
26387 default: llvm_unreachable("Unexpected instr type to insert");
26388 case X86::TAILJMPd64:
26389 case X86::TAILJMPr64:
26390 case X86::TAILJMPm64:
26391 case X86::TAILJMPr64_REX:
26392 case X86::TAILJMPm64_REX:
26393 llvm_unreachable("TAILJMP64 would not be touched here.");
26394 case X86::TCRETURNdi64:
26395 case X86::TCRETURNri64:
case X86::TCRETURNmi64:
  return BB;
26398 case X86::TLS_addr32:
26399 case X86::TLS_addr64:
26400 case X86::TLS_base_addr32:
26401 case X86::TLS_base_addr64:
26402 return EmitLoweredTLSAddr(MI, BB);
26403 case X86::CATCHRET:
26404 return EmitLoweredCatchRet(MI, BB);
26405 case X86::CATCHPAD:
26406 return EmitLoweredCatchPad(MI, BB);
26407 case X86::SEG_ALLOCA_32:
26408 case X86::SEG_ALLOCA_64:
26409 return EmitLoweredSegAlloca(MI, BB);
26410 case X86::TLSCall_32:
26411 case X86::TLSCall_64:
26412 return EmitLoweredTLSCall(MI, BB);
26413 case X86::CMOV_FR32:
26414 case X86::CMOV_FR64:
26415 case X86::CMOV_FR128:
26416 case X86::CMOV_GR8:
26417 case X86::CMOV_GR16:
26418 case X86::CMOV_GR32:
26419 case X86::CMOV_RFP32:
26420 case X86::CMOV_RFP64:
26421 case X86::CMOV_RFP80:
26422 case X86::CMOV_V2F64:
26423 case X86::CMOV_V2I64:
26424 case X86::CMOV_V4F32:
26425 case X86::CMOV_V4F64:
26426 case X86::CMOV_V4I64:
26427 case X86::CMOV_V16F32:
26428 case X86::CMOV_V8F32:
26429 case X86::CMOV_V8F64:
26430 case X86::CMOV_V8I64:
26431 case X86::CMOV_V8I1:
26432 case X86::CMOV_V16I1:
26433 case X86::CMOV_V32I1:
26434 case X86::CMOV_V64I1:
26435 return EmitLoweredSelect(MI, BB);
26437 case X86::RDFLAGS32:
26438 case X86::RDFLAGS64: {
unsigned PushF =
    MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26441 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26442 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26443 // Permit reads of the FLAGS register without it being defined.
26444 // This intrinsic exists to read external processor state in flags, such as
26445 // the trap flag, interrupt flag, and direction flag, none of which are
26446 // modeled by the backend.
26447 Push->getOperand(2).setIsUndef();
26448 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
26454 case X86::WRFLAGS32:
26455 case X86::WRFLAGS64: {
unsigned Push =
    MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
    MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26460 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26461 BuildMI(*BB, MI, DL, TII->get(PopF));
MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
26467 case X86::RELEASE_FADD32mr:
26468 case X86::RELEASE_FADD64mr:
26469 return EmitLoweredAtomicFP(MI, BB);
26471 case X86::FP32_TO_INT16_IN_MEM:
26472 case X86::FP32_TO_INT32_IN_MEM:
26473 case X86::FP32_TO_INT64_IN_MEM:
26474 case X86::FP64_TO_INT16_IN_MEM:
26475 case X86::FP64_TO_INT32_IN_MEM:
26476 case X86::FP64_TO_INT64_IN_MEM:
26477 case X86::FP80_TO_INT16_IN_MEM:
26478 case X86::FP80_TO_INT32_IN_MEM:
26479 case X86::FP80_TO_INT64_IN_MEM: {
26480 // Change the floating point control register to use "round towards zero"
26481 // mode when truncating to an integer value.
26482 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26483 addFrameReference(BuildMI(*BB, MI, DL,
26484 TII->get(X86::FNSTCW16m)), CWFrameIdx);
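// The sequence is: save the current FP control word, force round-toward-zero,
// perform the store-as-integer, then restore the original control word.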
26486 // Load the old value of the high byte of the control word...
unsigned OldCW =
    MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26489 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26492 // Set the high part to be round to zero...
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
    .addImm(0xC7F);
26496 // Reload the modified control word now...
26497 addFrameReference(BuildMI(*BB, MI, DL,
26498 TII->get(X86::FLDCW16m)), CWFrameIdx);
26500 // Restore the memory image of control word to original value
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
    .addReg(OldCW);
26504 // Get the X86 opcode to use.
unsigned Opc;
switch (MI.getOpcode()) {
26507 default: llvm_unreachable("illegal opcode!");
26508 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26509 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26510 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26511 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26512 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26513 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26514 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26515 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26516 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
}

X86AddressMode AM = getAddressFromInstr(&MI, 0);
26520 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26521 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26523 // Reload the original control word now.
26524 addFrameReference(BuildMI(*BB, MI, DL,
26525 TII->get(X86::FLDCW16m)), CWFrameIdx);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
26530 // String/text processing lowering.
26531 case X86::PCMPISTRM128REG:
26532 case X86::VPCMPISTRM128REG:
26533 case X86::PCMPISTRM128MEM:
26534 case X86::VPCMPISTRM128MEM:
26535 case X86::PCMPESTRM128REG:
26536 case X86::VPCMPESTRM128REG:
26537 case X86::PCMPESTRM128MEM:
26538 case X86::VPCMPESTRM128MEM:
26539 assert(Subtarget.hasSSE42() &&
26540 "Target must have SSE4.2 or AVX features enabled");
26541 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26543 // String/text processing lowering.
26544 case X86::PCMPISTRIREG:
26545 case X86::VPCMPISTRIREG:
26546 case X86::PCMPISTRIMEM:
26547 case X86::VPCMPISTRIMEM:
26548 case X86::PCMPESTRIREG:
26549 case X86::VPCMPESTRIREG:
26550 case X86::PCMPESTRIMEM:
26551 case X86::VPCMPESTRIMEM:
26552 assert(Subtarget.hasSSE42() &&
26553 "Target must have SSE4.2 or AVX features enabled");
26554 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26556 // Thread synchronization.
case X86::MONITOR:
  return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26559 case X86::MONITORX:
26560 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
case X86::CLZERO:
  return emitClzero(&MI, BB, Subtarget);
case X86::WRPKRU:
  return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
  return emitRDPKRU(MI, BB, Subtarget);
case X86::XBEGIN:
  return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26575 case X86::VASTART_SAVE_XMM_REGS:
26576 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26578 case X86::VAARG_64:
26579 return EmitVAARG64WithCustomInserter(MI, BB);
26581 case X86::EH_SjLj_SetJmp32:
26582 case X86::EH_SjLj_SetJmp64:
26583 return emitEHSjLjSetJmp(MI, BB);
26585 case X86::EH_SjLj_LongJmp32:
26586 case X86::EH_SjLj_LongJmp64:
26587 return emitEHSjLjLongJmp(MI, BB);
26589 case X86::Int_eh_sjlj_setup_dispatch:
26590 return EmitSjLjDispatchBlock(MI, BB);
26592 case TargetOpcode::STATEPOINT:
26593 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26594 // this point in the process. We diverge later.
26595 return emitPatchPoint(MI, BB);
26597 case TargetOpcode::STACKMAP:
26598 case TargetOpcode::PATCHPOINT:
26599 return emitPatchPoint(MI, BB);
26601 case X86::LCMPXCHG8B: {
26602 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
// requires a memory operand. On i686, if the current function also needs a
// base pointer (which is ESI on that target), the register allocator cannot
// find registers for an address of the form X(%reg, %reg, Y): there are never
// enough unreserved registers left during regalloc (without a base pointer the
// only option would be X(%edi, %esi, Y)). Help the register allocator by
// precomputing the address into a new vreg with LEA.
26613 // If it is not i686 or there is no base pointer - nothing to do here.
if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
  return BB;
// Even though this code does not strictly need the base pointer to be ESI,
// we check for it anyway: if the assert below fires, something has changed
// in the compiler's base pointer handling and this code most likely needs
// to be updated to match.
26621 assert(TRI->getBaseRegister() == X86::ESI &&
26622 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26623 "base pointer in mind");
26625 MachineRegisterInfo &MRI = MF->getRegInfo();
26626 MVT SPTy = getPointerTy(MF->getDataLayout());
26627 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26628 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26630 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26631 // Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use an index register.
if (AM.IndexReg == X86::NoRegister)
  return BB;
26636 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26637 // four operand definitions that are E[ABCD] registers. We skip them and
26638 // then insert the LEA.
26639 MachineBasicBlock::iterator MBBI(MI);
26640 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26641 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
++MBBI;
addFullAddress(
    BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
setDirectAddressInInstr(&MI, 0, computedAddrVReg);

return BB;
}
case X86::LCMPXCHG16B:
  return BB;
26652 case X86::LCMPXCHG8B_SAVE_EBX:
26653 case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
    MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26656 if (!BB->isLiveIn(BasePtr))
BB->addLiveIn(BasePtr);
return BB;
}
}
}
26663 //===----------------------------------------------------------------------===//
26664 // X86 Optimization Hooks
26665 //===----------------------------------------------------------------------===//
26667 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
const APInt &DemandedElts,
26671 const SelectionDAG &DAG,
26672 unsigned Depth) const {
26673 unsigned BitWidth = KnownZero.getBitWidth();
26674 unsigned Opc = Op.getOpcode();
26675 EVT VT = Op.getValueType();
26676 assert((Opc >= ISD::BUILTIN_OP_END ||
26677 Opc == ISD::INTRINSIC_WO_CHAIN ||
26678 Opc == ISD::INTRINSIC_W_CHAIN ||
26679 Opc == ISD::INTRINSIC_VOID) &&
26680 "Should use MaskedValueIsZero if you don't know whether Op"
26681 " is a target node!");
26683 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
26697 // These nodes' second result is a boolean.
if (Op.getResNo() == 0)
  break;
LLVM_FALLTHROUGH;
26701 case X86ISD::SETCC:
KnownZero.setBits(1, BitWidth);
break;
26704 case X86ISD::MOVMSK: {
26705 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
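// MOVMSK produces one bit per input vector element in the low bits of the
// result, so everything above NumLoBits is known to be zero.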
KnownZero.setBits(NumLoBits, BitWidth);
break;
}
26709 case X86ISD::VSHLI:
26710 case X86ISD::VSRLI: {
26711 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26712 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
KnownZero = APInt::getAllOnesValue(BitWidth);
KnownOne.clearAllBits();
break;
}
26717 DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
26718 unsigned ShAmt = ShiftImm->getZExtValue();
26719 if (Opc == X86ISD::VSHLI) {
26720 KnownZero = KnownZero << ShAmt;
26721 KnownOne = KnownOne << ShAmt;
26722 // Low bits are known zero.
26723 KnownZero.setLowBits(ShAmt);
} else {
  KnownZero.lshrInPlace(ShAmt);
26726 KnownOne.lshrInPlace(ShAmt);
26727 // High bits are known zero.
KnownZero.setHighBits(ShAmt);
}
}
break;
}
26733 case X86ISD::VZEXT: {
26734 SDValue N0 = Op.getOperand(0);
26735 unsigned NumElts = VT.getVectorNumElements();
26737 EVT SrcVT = N0.getValueType();
26738 unsigned InNumElts = SrcVT.getVectorNumElements();
26739 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
26740 assert(InNumElts >= NumElts && "Illegal VZEXT input");
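// Compute the known bits of the narrower source, then zero-extend: every bit
// above the source scalar width becomes known zero.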
26742 KnownZero = KnownOne = APInt(InBitWidth, 0);
26743 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
26744 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1);
26745 KnownOne = KnownOne.zext(BitWidth);
26746 KnownZero = KnownZero.zext(BitWidth);
KnownZero.setBits(InBitWidth, BitWidth);
break;
}
}
}
26753 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26754 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
26755 unsigned Depth) const {
26756 unsigned VTBits = Op.getScalarValueSizeInBits();
26757 unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
26763 case X86ISD::VSEXT: {
26764 SDValue Src = Op.getOperand(0);
26765 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
26770 case X86ISD::VSRAI: {
26771 SDValue Src = Op.getOperand(0);
26772 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
26778 case X86ISD::PCMPGT:
26779 case X86ISD::PCMPEQ:
26781 case X86ISD::VPCOM:
26782 case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
}

// Fallback case.
return 1;
}
26791 /// Returns true (and the GlobalValue and the offset) if the node is a
26792 /// GlobalAddress + offset.
26793 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26794 const GlobalValue* &GA,
26795 int64_t &Offset) const {
26796 if (N->getOpcode() == X86ISD::Wrapper) {
26797 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26798 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
return true;
}
}
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
26808 // TODO: Investigate sharing more of this with shuffle lowering.
26809 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26810 bool AllowFloatDomain, bool AllowIntDomain,
26811 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
26812 const X86Subtarget &Subtarget,
26813 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26814 unsigned NumMaskElts = Mask.size();
26815 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26817 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
26818 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
26819 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
26820 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
26821 unsigned MaxScale = 64 / MaskEltSize;
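// Try each power-of-two extension scale: result element i must come from mask
// element i * Scale, and the elements in between must be zero or undef.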
26822 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
unsigned NumDstElts = NumMaskElts / Scale;
26825 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26826 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
26831 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
26832 if (SrcVT != MaskVT)
26833 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
26834 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26835 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26836 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
: unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
26843 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26844 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26845 isUndefOrEqual(Mask[0], 0) &&
26846 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26847 Shuffle = X86ISD::VZEXT_MOVL;
SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
return true;
}
// Check if we have SSE3, which will let us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD but have the option of
// folding the input operand into even an unaligned memory load.
26855 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
26856 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26857 Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
}
26861 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26862 Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
26866 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26867 Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v4f32;
return true;
}
}
26873 if (MaskVT.is256BitVector() && AllowFloatDomain) {
26874 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26875 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26876 Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v4f64;
return true;
}
26880 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26881 Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
26885 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26886 Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v8f32;
return true;
}
}
26892 if (MaskVT.is512BitVector() && AllowFloatDomain) {
26893 assert(Subtarget.hasAVX512() &&
26894 "AVX512 required for 512-bit vector shuffles");
26895 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26896 Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v8f64;
return true;
}
26900 if (isTargetShuffleEquivalent(
26901 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26902 Shuffle = X86ISD::MOVSLDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
26906 if (isTargetShuffleEquivalent(
26907 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26908 Shuffle = X86ISD::MOVSHDUP;
SrcVT = DstVT = MVT::v16f32;
return true;
}
}
26914 // Attempt to match against broadcast-from-vector.
26915 if (Subtarget.hasAVX2()) {
26916 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
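// An all-zero mask means every output element copies lane 0, i.e. a broadcast.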
26917 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26918 SrcVT = DstVT = MaskVT;
Shuffle = X86ISD::VBROADCAST;
return true;
}
}

return false;
}
26927 // Attempt to match a combined shuffle mask against supported unary immediate
26928 // permute instructions.
26929 // TODO: Investigate sharing more of this with shuffle lowering.
26930 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26931 bool AllowFloatDomain,
26932 bool AllowIntDomain,
26933 const X86Subtarget &Subtarget,
26934 unsigned &Shuffle, MVT &ShuffleVT,
26935 unsigned &PermuteImm) {
26936 unsigned NumMaskElts = Mask.size();
26938 bool ContainsZeros = false;
26939 APInt Zeroable(NumMaskElts, false);
26940 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (isUndefOrZero(M))
26943 Zeroable.setBit(i);
ContainsZeros |= (M == SM_SentinelZero);
}
26947 // Attempt to match against byte/bit shifts.
26948 // FIXME: Add 512-bit support.
26949 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26950 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26951 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26952 MaskVT.getScalarSizeInBits(), Mask,
26953 0, Zeroable, Subtarget);
26954 if (0 < ShiftAmt) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
// Ensure we don't contain any zero elements.
if (ContainsZeros)
  return false;
26964 assert(llvm::all_of(Mask, [&](int M) {
26965 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26966 }) && "Expected unary shuffle");
26968 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26969 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26970 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26972 // Handle PSHUFLW/PSHUFHW repeated patterns.
26973 if (MaskScalarSizeInBits == 16) {
26974 SmallVector<int, 4> RepeatedMask;
26975 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26976 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26977 ArrayRef<int> HiMask(Mask.data() + 4, 4);
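// The repeated 8 x i16 mask splits into a low and a high half; PSHUFLW and
// PSHUFHW can each permute only one half while leaving the other unchanged.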
26979 // PSHUFLW: permute lower 4 elements only.
26980 if (isUndefOrInRange(LoMask, 0, 4) &&
26981 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26982 Shuffle = X86ISD::PSHUFLW;
26983 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(LoMask);
return true;
}
26988 // PSHUFHW: permute upper 4 elements only.
26989 if (isUndefOrInRange(HiMask, 4, 8) &&
26990 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26991 // Offset the HiMask so that we can create the shuffle immediate.
26992 int OffsetHiMask[4];
26993 for (int i = 0; i != 4; ++i)
26994 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26996 Shuffle = X86ISD::PSHUFHW;
26997 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
}
}
27007 // We only support permutation of 32/64 bit elements after this.
if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
  return false;
27011 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27012 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
  return false;
27016 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
27017 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
27018 AllowFloatDomain = true;
27019 AllowIntDomain = false;
27022 // Check for lane crossing permutes.
27023 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27024 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27025 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
27026 Shuffle = X86ISD::VPERMI;
27027 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
PermuteImm = getV4X86ShuffleImm(Mask);
return true;
}
27031 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
27032 SmallVector<int, 4> RepeatedMask;
27033 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27034 Shuffle = X86ISD::VPERMI;
27035 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
PermuteImm = getV4X86ShuffleImm(RepeatedMask);
return true;
}
}
return false;
}
27043 // VPERMILPD can permute with a non-repeating shuffle.
27044 if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
27045 Shuffle = X86ISD::VPERMILPI;
27046 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
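// VPERMILPD selects within each 128-bit lane, so each mask element contributes
// a single odd/even bit to the immediate.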
PermuteImm = 0;
for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef)
  continue;
27052 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
PermuteImm |= (M & 1) << i;
}
return true;
}
27058 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
27059 SmallVector<int, 4> RepeatedMask;
if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
  return false;
27063 // Narrow the repeated mask for 32-bit element permutes.
27064 SmallVector<int, 4> WordMask = RepeatedMask;
27065 if (MaskScalarSizeInBits == 64)
27066 scaleShuffleMask(2, RepeatedMask, WordMask);
27068 Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
27069 ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
27070 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
PermuteImm = getV4X86ShuffleImm(WordMask);
return true;
}
27075 // Attempt to match a combined unary shuffle mask against supported binary
27076 // shuffle instructions.
27077 // TODO: Investigate sharing more of this with shuffle lowering.
27078 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27079 bool AllowFloatDomain, bool AllowIntDomain,
27080 SDValue &V1, SDValue &V2, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool &IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27087 if (MaskVT.is128BitVector()) {
27088 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27090 Shuffle = X86ISD::MOVLHPS;
27091 ShuffleVT = MVT::v4f32;
27094 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27096 Shuffle = X86ISD::MOVHLPS;
27097 ShuffleVT = MVT::v4f32;
27100 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27101 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27103 Shuffle = X86ISD::MOVSD;
27104 ShuffleVT = MaskVT;
27107 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27108 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27109 Shuffle = X86ISD::MOVSS;
27110 ShuffleVT = MaskVT;
27115 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27116 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27117 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27118 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27119 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27120 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27121 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
DAG, Subtarget)) {
ShuffleVT = MaskVT;
27124 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}

return false;
}
27133 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27134 bool AllowFloatDomain,
27135 bool AllowIntDomain,
27136 SDValue &V1, SDValue &V2, SDLoc &DL,
27138 const X86Subtarget &Subtarget,
27139 unsigned &Shuffle, MVT &ShuffleVT,
27140 unsigned &PermuteImm) {
27141 unsigned NumMaskElts = Mask.size();
27142 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27144 // Attempt to match against PALIGNR byte rotate.
27145 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27146 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27147 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27148 if (0 < ByteRotation) {
27149 Shuffle = X86ISD::PALIGNR;
27150 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27151 PermuteImm = ByteRotation;
27156 // Attempt to combine to X86ISD::BLENDI.
27157 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27158 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27159 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27160 uint64_t BlendMask = 0;
27161 bool ForceV1Zero = false, ForceV2Zero = false;
27162 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27163 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27165 if (MaskVT == MVT::v16i16) {
27166 // We can only use v16i16 PBLENDW if the lanes are repeated.
27167 SmallVector<int, 8> RepeatedMask;
27168 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27170 assert(RepeatedMask.size() == 8 &&
27171 "Repeated mask size doesn't match!");
27173 for (int i = 0; i < 8; ++i)
27174 if (RepeatedMask[i] >= 8)
27175 PermuteImm |= 1 << i;
27176 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27177 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27178 Shuffle = X86ISD::BLENDI;
27179 ShuffleVT = MaskVT;
27183 // Determine a type compatible with X86ISD::BLENDI.
27184 ShuffleVT = MaskVT;
27185 if (Subtarget.hasAVX2()) {
27186 if (ShuffleVT == MVT::v4i64)
27187 ShuffleVT = MVT::v8i32;
27188 else if (ShuffleVT == MVT::v2i64)
27189 ShuffleVT = MVT::v4i32;
27191 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27192 ShuffleVT = MVT::v8i16;
27193 else if (ShuffleVT == MVT::v4i64)
27194 ShuffleVT = MVT::v4f64;
27195 else if (ShuffleVT == MVT::v8i32)
27196 ShuffleVT = MVT::v8f32;
27199 if (!ShuffleVT.isFloatingPoint()) {
27200 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27202 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27203 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27204 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27207 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27208 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27209 PermuteImm = (unsigned)BlendMask;
27210 Shuffle = X86ISD::BLENDI;
27216 // Attempt to combine to INSERTPS.
27217 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27218 MaskVT.is128BitVector()) {
27219 APInt Zeroable(4, 0);
27220 for (unsigned i = 0; i != NumMaskElts; ++i)
27222 Zeroable.setBit(i);
27224 if (Zeroable.getBoolValue() &&
27225 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27226 Shuffle = X86ISD::INSERTPS;
27227 ShuffleVT = MVT::v4f32;
27232 // Attempt to combine to SHUFPD.
27233 if (AllowFloatDomain && EltSizeInBits == 64 &&
27234 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27235 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27236 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27237 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27238 Shuffle = X86ISD::SHUFP;
27239 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27244 // Attempt to combine to SHUFPS.
27245 if (AllowFloatDomain && EltSizeInBits == 32 &&
27246 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27247 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27248 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27249 SmallVector<int, 4> RepeatedMask;
27250 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// Match each half of the repeated mask, to determine if it's just
// referencing one of the vectors, is zeroable, or is entirely undef.
27253 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27254 int M0 = RepeatedMask[Offset];
27255 int M1 = RepeatedMask[Offset + 1];
27257 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27258 return DAG.getUNDEF(MaskVT);
27259 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27260 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27261 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27262 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27263 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27264 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27265 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27267 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27268 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27269 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27276 int ShufMask[4] = {-1, -1, -1, -1};
27277 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27278 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27283 Shuffle = X86ISD::SHUFP;
27284 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27285 PermuteImm = getV4X86ShuffleImm(ShufMask);
/// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
27297 /// This is the leaf of the recursive combine below. When we have found some
27298 /// chain of single-use x86 shuffle instructions and accumulated the combined
27299 /// shuffle mask represented by them, this will try to pattern match that mask
27300 /// into either a single instruction if there is a special purpose instruction
27301 /// for this operation, or into a PSHUFB instruction which is a fully general
27302 /// instruction but should only be used to replace chains over a certain depth.
27303 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27304 ArrayRef<int> BaseMask, int Depth,
27305 bool HasVariableMask, SelectionDAG &DAG,
27306 TargetLowering::DAGCombinerInfo &DCI,
27307 const X86Subtarget &Subtarget) {
27308 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27309 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27310 "Unexpected number of shuffle inputs!");
27312 // Find the inputs that enter the chain. Note that multiple uses are OK
27313 // here, we're not going to remove the operands we find.
27314 bool UnaryShuffle = (Inputs.size() == 1);
27315 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27316 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27317 : peekThroughBitcasts(Inputs[1]));
27319 MVT VT1 = V1.getSimpleValueType();
27320 MVT VT2 = V2.getSimpleValueType();
27321 MVT RootVT = Root.getSimpleValueType();
27322 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27323 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27324 "Vector size mismatch");
27329 unsigned NumBaseMaskElts = BaseMask.size();
27330 if (NumBaseMaskElts == 1) {
27331 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27332 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27337 unsigned RootSizeInBits = RootVT.getSizeInBits();
27338 unsigned NumRootElts = RootVT.getVectorNumElements();
27339 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27340 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27341 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
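// Treat the chain as float-domain if any input is floating point, and also on
// AVX1-only targets for 256-bit vectors, where only FP shuffles are available.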
27343 // Don't combine if we are a AVX512/EVEX target and the mask element size
27344 // is different from the root element size - this would prevent writemasks
27345 // from being reused.
27346 // TODO - this currently prevents all lane shuffles from occurring.
27347 // TODO - check for writemasks usage instead of always preventing combining.
27348 // TODO - attempt to narrow Mask back to writemask size.
27349 bool IsEVEXShuffle =
27350 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27351 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27354 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27356 // Handle 128-bit lane shuffles of 256-bit vectors.
27357 // TODO - this should support binary shuffles.
27358 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27359 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27360 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27361 return false; // Nothing to do!
27362 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27363 unsigned PermMask = 0;
27364 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27365 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
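// Each nibble of the VPERM2X128 immediate selects a source 128-bit lane, and
// setting bit 3 (0x8) zeroes that destination lane instead.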
27367 Res = DAG.getBitcast(ShuffleVT, V1);
27368 DCI.AddToWorklist(Res.getNode());
27369 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27370 DAG.getUNDEF(ShuffleVT),
27371 DAG.getConstant(PermMask, DL, MVT::i8));
27372 DCI.AddToWorklist(Res.getNode());
27373 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27378 // For masks that have been widened to 128-bit elements or more,
27379 // narrow back down to 64-bit elements.
27380 SmallVector<int, 64> Mask;
27381 if (BaseMaskEltSizeInBits > 64) {
27382 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27383 int MaskScale = BaseMaskEltSizeInBits / 64;
27384 scaleShuffleMask(MaskScale, BaseMask, Mask);
27386 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27389 unsigned NumMaskElts = Mask.size();
27390 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27392 // Determine the effective mask value type.
27393 FloatDomain &= (32 <= MaskEltSizeInBits);
27394 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27395 : MVT::getIntegerVT(MaskEltSizeInBits);
27396 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27398 // Only allow legal mask types.
27399 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27402 // Attempt to match the mask against known shuffle patterns.
27403 MVT ShuffleSrcVT, ShuffleVT;
27404 unsigned Shuffle, PermuteImm;
27406 // Which shuffle domains are permitted?
27407 // Permit domain crossing at higher combine depths.
27408 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27409 bool AllowIntDomain = !FloatDomain || (Depth > 3);
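// Crossing between the integer and FP domains typically costs a bypass delay,
// but at higher depths it is worth it if the whole chain folds into a single
// instruction.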
27411 if (UnaryShuffle) {
27412 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27413 // directly if we don't shuffle the lower element and we shuffle the upper
27414 // (zero) elements within themselves.
27415 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27416 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27417 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27418 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27419 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27420 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27421 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27427 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27428 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27430 if (Depth == 1 && Root.getOpcode() == Shuffle)
27431 return false; // Nothing to do!
27432 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27433 return false; // AVX512 Writemask clash.
27434 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27435 DCI.AddToWorklist(Res.getNode());
27436 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27437 DCI.AddToWorklist(Res.getNode());
27438 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27443 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27444 AllowIntDomain, Subtarget, Shuffle,
27445 ShuffleVT, PermuteImm)) {
27446 if (Depth == 1 && Root.getOpcode() == Shuffle)
27447 return false; // Nothing to do!
27448 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27449 return false; // AVX512 Writemask clash.
27450 Res = DAG.getBitcast(ShuffleVT, V1);
27451 DCI.AddToWorklist(Res.getNode());
27452 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27453 DAG.getConstant(PermuteImm, DL, MVT::i8));
27454 DCI.AddToWorklist(Res.getNode());
27455 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27461 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27462 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27464 if (Depth == 1 && Root.getOpcode() == Shuffle)
27465 return false; // Nothing to do!
27466 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27467 return false; // AVX512 Writemask clash.
27468 V1 = DAG.getBitcast(ShuffleVT, V1);
27469 DCI.AddToWorklist(V1.getNode());
27470 V2 = DAG.getBitcast(ShuffleVT, V2);
27471 DCI.AddToWorklist(V2.getNode());
27472 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27473 DCI.AddToWorklist(Res.getNode());
27474 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27479 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27480 AllowIntDomain, V1, V2, DL, DAG,
27481 Subtarget, Shuffle, ShuffleVT,
27483 if (Depth == 1 && Root.getOpcode() == Shuffle)
27484 return false; // Nothing to do!
27485 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27486 return false; // AVX512 Writemask clash.
27487 V1 = DAG.getBitcast(ShuffleVT, V1);
27488 DCI.AddToWorklist(V1.getNode());
27489 V2 = DAG.getBitcast(ShuffleVT, V2);
27490 DCI.AddToWorklist(V2.getNode());
27491 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27492 DAG.getConstant(PermuteImm, DL, MVT::i8));
27493 DCI.AddToWorklist(Res.getNode());
27494 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27495 /*AddTo*/ true);
27496 return true;
27497 }
27499 // Don't try to re-form single instruction chains under any circumstances now
27500 // that we've done encoding canonicalization for them.
27501 if (Depth < 2)
27502 return false;
27504 bool MaskContainsZeros =
27505 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27507 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27508 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27509 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27510 ((Subtarget.hasAVX2() &&
27511 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27512 (Subtarget.hasAVX512() &&
27513 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27514 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27515 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27516 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27517 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27518 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27519 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27520 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27521 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
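// VPermMask is the variable control vector for VPERMV: one element-sized
// index constant per output lane.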
27522 DCI.AddToWorklist(VPermMask.getNode());
27523 Res = DAG.getBitcast(MaskVT, V1);
27524 DCI.AddToWorklist(Res.getNode());
27525 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27526 DCI.AddToWorklist(Res.getNode());
27527 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27528 /*AddTo*/ true);
27529 return true;
27530 }
27532 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27533 // vector as the second source.
27534 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27535 ((Subtarget.hasAVX512() &&
27536 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27537 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27538 (Subtarget.hasVLX() &&
27539 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27540 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27541 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27542 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27543 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27544 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27545 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27546 for (unsigned i = 0; i != NumMaskElts; ++i)
27547 if (Mask[i] == SM_SentinelZero)
27548 Mask[i] = NumMaskElts + i;
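// Indices of NumMaskElts and above select from the second VPERMV3 source,
// which we supply as a zero vector below.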
27550 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27551 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27552 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27553 DCI.AddToWorklist(VPermMask.getNode());
27554 Res = DAG.getBitcast(MaskVT, V1);
27555 DCI.AddToWorklist(Res.getNode());
27556 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27557 DCI.AddToWorklist(Zero.getNode());
27558 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27559 DCI.AddToWorklist(Res.getNode());
27560 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27561 /*AddTo*/ true);
27562 return true;
27563 }
27565 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27566 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27567 ((Subtarget.hasAVX512() &&
27568 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27569 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27570 (Subtarget.hasVLX() &&
27571 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27572 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27573 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27574 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27575 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27576 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27577 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27578 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27579 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27580 DCI.AddToWorklist(VPermMask.getNode());
27581 V1 = DAG.getBitcast(MaskVT, V1);
27582 DCI.AddToWorklist(V1.getNode());
27583 V2 = DAG.getBitcast(MaskVT, V2);
27584 DCI.AddToWorklist(V2.getNode());
27585 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27586 DCI.AddToWorklist(Res.getNode());
27587 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27588 /*AddTo*/ true);
27589 return true;
27590 }
27592 return false;
27593 }
27594 // See if we can combine a single input shuffle with zeros to a bit-mask,
27595 // which is much simpler than any shuffle.
27596 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27597 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27598 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27599 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27600 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27601 APInt UndefElts(NumMaskElts, 0);
27602 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27603 for (unsigned i = 0; i != NumMaskElts; ++i) {
27604 int M = Mask[i];
27605 if (M == SM_SentinelUndef) {
27606 UndefElts.setBit(i);
27607 continue;
27608 }
27609 if (M == SM_SentinelZero)
27610 continue;
27611 EltBits[i] = AllOnes;
27612 }
27613 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27614 DCI.AddToWorklist(BitMask.getNode());
27615 Res = DAG.getBitcast(MaskVT, V1);
27616 DCI.AddToWorklist(Res.getNode());
27617 unsigned AndOpcode =
27618 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
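// Prefer FAND in the floating-point domain; the bit pattern of the result is
// identical, and this helps avoid an unnecessary domain crossing.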
27619 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27620 DCI.AddToWorklist(Res.getNode());
27621 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27626 // If we have a single input shuffle with different shuffle patterns in the
27627 // 128-bit lanes, use the variable mask to lower to VPERMILPS.
27628 // TODO: Combine other mask types at higher depths.
27629 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27630 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27631 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27632 SmallVector<SDValue, 16> VPermIdx;
27633 for (int M : Mask) {
27634 SDValue Idx =
27635 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27636 VPermIdx.push_back(Idx);
27637 }
27638 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27639 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27640 DCI.AddToWorklist(VPermMask.getNode());
27641 Res = DAG.getBitcast(MaskVT, V1);
27642 DCI.AddToWorklist(Res.getNode());
27643 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27644 DCI.AddToWorklist(Res.getNode());
27645 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27646 /*AddTo*/ true);
27647 return true;
27648 }
27650 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27651 // to VPERMIL2PD/VPERMIL2PS.
27652 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27653 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27654 MaskVT == MVT::v8f32)) {
27655 // VPERMIL2 Operation.
27656 // Bits[3] - Match Bit.
27657 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27658 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
27659 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27660 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27661 SmallVector<int, 8> VPerm2Idx;
27662 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27663 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27664 unsigned M2ZImm = 0;
27665 for (int M : Mask) {
27666 if (M == SM_SentinelUndef) {
27667 VPerm2Idx.push_back(-1);
27668 continue;
27669 }
27670 if (M == SM_SentinelZero) {
27672 VPerm2Idx.push_back(8);
27673 continue;
27674 }
27675 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
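// Per the table above, VPERMIL2PD keeps its per-lane selector in bits[2:1]
// rather than bits[2:0], so 64-bit element indices are doubled below.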
27676 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27677 VPerm2Idx.push_back(Index);
27678 }
27679 V1 = DAG.getBitcast(MaskVT, V1);
27680 DCI.AddToWorklist(V1.getNode());
27681 V2 = DAG.getBitcast(MaskVT, V2);
27682 DCI.AddToWorklist(V2.getNode());
27683 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27684 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27685 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27686 DAG.getConstant(M2ZImm, DL, MVT::i8));
27687 DCI.AddToWorklist(Res.getNode());
27688 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27689 /*AddTo*/ true);
27690 return true;
27691 }
27693 // If we have 3 or more shuffle instructions or a chain involving a variable
27694 // mask, we can replace them with a single PSHUFB instruction profitably.
27695 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27696 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27697 // more aggressive.
27698 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27699 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27700 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27701 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27702 SmallVector<SDValue, 16> PSHUFBMask;
27703 int NumBytes = RootVT.getSizeInBits() / 8;
27704 int Ratio = NumBytes / NumMaskElts;
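// Each shuffle mask element expands to Ratio consecutive byte selectors;
// e.g. a v4i32 mask over a 16-byte vector gives Ratio == 4.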
27705 for (int i = 0; i < NumBytes; ++i) {
27706 int M = Mask[i / Ratio];
27707 if (M == SM_SentinelUndef) {
27708 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27709 continue;
27710 }
27711 if (M == SM_SentinelZero) {
27712 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27713 continue;
27714 }
27715 M = Ratio * M + i % Ratio;
27716 assert((M / 16) == (i / 16) && "Lane crossing detected");
27717 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27718 }
27719 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27720 Res = DAG.getBitcast(ByteVT, V1);
27721 DCI.AddToWorklist(Res.getNode());
27722 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27723 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27724 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27725 DCI.AddToWorklist(Res.getNode());
27726 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27727 /*AddTo*/ true);
27728 return true;
27729 }
27731 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27732 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27733 // slower than PSHUFB on targets that support both.
27734 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27735 Subtarget.hasXOP()) {
27736 // VPPERM Mask Operation
27737 // Bits[4:0] - Byte Index (0 - 31)
27738 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
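// e.g. the constant 128 (4 << 5) pushed below selects the ZERO operation for
// that destination byte.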
27739 SmallVector<SDValue, 16> VPPERMMask;
27740 int NumBytes = RootVT.getSizeInBits() / 8;
27741 int Ratio = NumBytes / NumMaskElts;
27742 for (int i = 0; i < NumBytes; ++i) {
27743 int M = Mask[i / Ratio];
27744 if (M == SM_SentinelUndef) {
27745 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27746 continue;
27747 }
27748 if (M == SM_SentinelZero) {
27749 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27750 continue;
27751 }
27752 M = Ratio * M + i % Ratio;
27753 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27754 }
27755 MVT ByteVT = MVT::v16i8;
27756 V1 = DAG.getBitcast(ByteVT, V1);
27757 DCI.AddToWorklist(V1.getNode());
27758 V2 = DAG.getBitcast(ByteVT, V2);
27759 DCI.AddToWorklist(V2.getNode());
27760 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27761 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27762 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27763 DCI.AddToWorklist(Res.getNode());
27764 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27765 /*AddTo*/ true);
27766 return true;
27767 }
27769 // Failed to find any combines.
27770 return false;
27771 }
27773 // Attempt to constant fold all of the constant source ops.
27774 // Returns true if the entire shuffle is folded to a constant.
27775 // TODO: Extend this to merge multiple constant Ops and update the mask.
27776 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27777 ArrayRef<int> Mask, SDValue Root,
27778 bool HasVariableMask, SelectionDAG &DAG,
27779 TargetLowering::DAGCombinerInfo &DCI,
27780 const X86Subtarget &Subtarget) {
27781 MVT VT = Root.getSimpleValueType();
27783 unsigned SizeInBits = VT.getSizeInBits();
27784 unsigned NumMaskElts = Mask.size();
27785 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27786 unsigned NumOps = Ops.size();
27788 // Extract constant bits from each source op.
27789 bool OneUseConstantOp = false;
27790 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27791 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27792 for (unsigned i = 0; i != NumOps; ++i) {
27793 SDValue SrcOp = Ops[i];
27794 OneUseConstantOp |= SrcOp.hasOneUse();
27795 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27796 RawBitsOps[i]))
27797 return false;
27798 }
27800 // Only fold if at least one of the constants is only used once or
27801 // the combined shuffle has included a variable mask shuffle; this
27802 // is to avoid constant pool bloat.
27803 if (!OneUseConstantOp && !HasVariableMask)
27804 return false;
27806 // Shuffle the constant bits according to the mask.
27807 APInt UndefElts(NumMaskElts, 0);
27808 APInt ZeroElts(NumMaskElts, 0);
27809 APInt ConstantElts(NumMaskElts, 0);
27810 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27811 APInt::getNullValue(MaskSizeInBits));
27812 for (unsigned i = 0; i != NumMaskElts; ++i) {
27813 int M = Mask[i];
27814 if (M == SM_SentinelUndef) {
27815 UndefElts.setBit(i);
27816 continue;
27817 } else if (M == SM_SentinelZero) {
27818 ZeroElts.setBit(i);
27819 continue;
27820 }
27821 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27823 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27824 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
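// M indexes the concatenation of all source ops, so it splits into which op
// (SrcOpIdx) and which element within that op (SrcMaskIdx).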
27826 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27827 if (SrcUndefElts[SrcMaskIdx]) {
27828 UndefElts.setBit(i);
27829 continue;
27830 }
27832 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27833 APInt &Bits = SrcEltBits[SrcMaskIdx];
27834 if (!Bits) {
27835 ZeroElts.setBit(i);
27836 continue;
27837 }
27839 ConstantElts.setBit(i);
27840 ConstantBitData[i] = Bits;
27841 }
27842 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
27844 // Create the constant data.
27845 MVT MaskSVT;
27846 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27847 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27848 else
27849 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27851 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27853 SDLoc DL(Root);
27854 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27855 DCI.AddToWorklist(CstOp.getNode());
27856 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27857 return true;
27858 }
27860 /// \brief Fully generic combining of x86 shuffle instructions.
27862 /// This should be the last combine run over the x86 shuffle instructions. Once
27863 /// they have been fully optimized, this will recursively consider all chains
27864 /// of single-use shuffle instructions, build a generic model of the cumulative
27865 /// shuffle operation, and check for simpler instructions which implement this
27866 /// operation. We use this primarily for two purposes:
27868 /// 1) Collapse generic shuffles to specialized single instructions when
27869 /// equivalent. In most cases, this is just an encoding size win, but
27870 /// sometimes we will collapse multiple generic shuffles into a single
27871 /// special-purpose shuffle.
27872 /// 2) Look for sequences of shuffle instructions with 3 or more total
27873 /// instructions, and replace them with the slightly more expensive SSSE3
27874 /// PSHUFB instruction if available. We do this as the last combining step
27875 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27876 /// a suitable short sequence of other instructions. The PSHUFB will either
27877 /// use a register or have to read from memory and so is slightly (but only
27878 /// slightly) more expensive than the other shuffle instructions.
27880 /// Because this is inherently a quadratic operation (for each shuffle in
27881 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27882 /// This should never be an issue in practice as the shuffle lowering doesn't
27883 /// produce sequences of more than 8 instructions.
27885 /// FIXME: We will currently miss some cases where the redundant shuffling
27886 /// would simplify under the threshold for PSHUFB formation because of
27887 /// combine-ordering. To fix this, we should do the redundant instruction
27888 /// combining in this recursive walk.
27889 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27890 int SrcOpIndex, SDValue Root,
27891 ArrayRef<int> RootMask,
27892 ArrayRef<const SDNode*> SrcNodes,
27893 int Depth, bool HasVariableMask,
27894 SelectionDAG &DAG,
27895 TargetLowering::DAGCombinerInfo &DCI,
27896 const X86Subtarget &Subtarget) {
27897 // Bound the depth of our recursive combine because this is ultimately
27898 // quadratic in nature.
27899 if (Depth > 8)
27900 return false;
27902 // Directly rip through bitcasts to find the underlying operand.
27903 SDValue Op = SrcOps[SrcOpIndex];
27904 Op = peekThroughOneUseBitcasts(Op);
27906 MVT VT = Op.getSimpleValueType();
27907 if (!VT.isVector())
27908 return false; // Bail if we hit a non-vector.
27910 assert(Root.getSimpleValueType().isVector() &&
27911 "Shuffles operate on vector types!");
27912 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27913 "Can only combine shuffles of the same vector register size.");
27915 // Extract target shuffle mask and resolve sentinels and inputs.
27916 SmallVector<int, 64> OpMask;
27917 SmallVector<SDValue, 2> OpInputs;
27918 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
27919 return false;
27921 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27922 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27923 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27925 // Add the inputs to the Ops list, avoiding duplicates.
27926 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
27928 int InputIdx0 = -1, InputIdx1 = -1;
27929 for (int i = 0, e = Ops.size(); i < e; ++i) {
27930 SDValue BC = peekThroughBitcasts(Ops[i]);
27931 if (Input0 && BC == peekThroughBitcasts(Input0))
27932 InputIdx0 = i;
27933 if (Input1 && BC == peekThroughBitcasts(Input1))
27934 InputIdx1 = i;
27935 }
27937 if (Input0 && InputIdx0 < 0) {
27938 InputIdx0 = SrcOpIndex;
27939 Ops[SrcOpIndex] = Input0;
27940 }
27941 if (Input1 && InputIdx1 < 0) {
27942 InputIdx1 = Ops.size();
27943 Ops.push_back(Input1);
27944 }
27946 assert(((RootMask.size() > OpMask.size() &&
27947 RootMask.size() % OpMask.size() == 0) ||
27948 (OpMask.size() > RootMask.size() &&
27949 OpMask.size() % RootMask.size() == 0) ||
27950 OpMask.size() == RootMask.size()) &&
27951 "The smaller number of elements must divide the larger.");
27952 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27953 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27954 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
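// At most one of the two masks is coarser; each of its elements expands to
// Ratio entries of the merged mask (checked by the assert below).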
27955 assert(((RootRatio == 1 && OpRatio == 1) ||
27956 (RootRatio == 1) != (OpRatio == 1)) &&
27957 "Must not have a ratio for both incoming and op masks!");
27959 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27961 // Merge this shuffle operation's mask into our accumulated mask. Note that
27962 // this shuffle's mask will be the first applied to the input, followed by the
27963 // root mask to get us all the way to the root value arrangement. The reason
27964 // for this order is that we are recursing up the operation chain.
27965 for (int i = 0; i < MaskWidth; ++i) {
27966 int RootIdx = i / RootRatio;
27967 if (RootMask[RootIdx] < 0) {
27968 // This is a zero or undef lane, we're done.
27969 Mask[i] = RootMask[RootIdx];
27970 continue;
27971 }
27973 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27975 // Just insert the scaled root mask value if it references an input other
27976 // than the SrcOp we're currently inserting.
27977 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27978 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27979 Mask[i] = RootMaskedIdx;
27980 continue;
27981 }
27983 RootMaskedIdx %= MaskWidth;
27985 int OpIdx = RootMaskedIdx / OpRatio;
27986 if (OpMask[OpIdx] < 0) {
27987 // The incoming lanes are zero or undef, it doesn't matter which ones we
27988 // are using.
27989 Mask[i] = OpMask[OpIdx];
27990 continue;
27991 }
27993 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27994 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27995 OpMaskedIdx %= MaskWidth;
27997 if (OpMask[OpIdx] < (int)OpMask.size()) {
27998 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27999 OpMaskedIdx += InputIdx0 * MaskWidth;
28000 } else {
28001 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28002 OpMaskedIdx += InputIdx1 * MaskWidth;
28003 }
28005 Mask[i] = OpMaskedIdx;
28006 }
28008 // Handle the all undef/zero cases early.
28009 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28010 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28011 return true;
28012 }
28013 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28014 // TODO - should we handle the mixed zero/undef case as well? Just returning
28015 // a zero mask will lose information on undef elements possibly reducing
28016 // future combine possibilities.
28017 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28018 Subtarget, DAG, SDLoc(Root)));
28019 return true;
28020 }
28022 // Remove unused shuffle source ops.
28023 resolveTargetShuffleInputsAndMask(Ops, Mask);
28024 assert(!Ops.empty() && "Shuffle with no inputs detected");
28026 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28028 // Update the list of shuffle nodes that have been combined so far.
28029 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28030 SrcNodes.end());
28031 CombinedNodes.push_back(Op.getNode());
28033 // See if we can recurse into each shuffle source op (if it's a target
28034 // shuffle). The source op should only be combined if it either has a
28035 // single use (i.e. current Op) or all its users have already been combined.
28036 for (int i = 0, e = Ops.size(); i < e; ++i)
28037 if (Ops[i].getNode()->hasOneUse() ||
28038 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28039 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28040 Depth + 1, HasVariableMask, DAG, DCI,
28041 Subtarget))
28042 return true;
28044 // Attempt to constant fold all of the constant source ops.
28045 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28046 Subtarget))
28047 return true;
28049 // We can only combine unary and binary shuffle mask cases.
28050 if (Ops.size() > 2)
28051 return false;
28053 // Minor canonicalization of the accumulated shuffle mask to make it easier
28054 // to match below. All this does is detect masks with sequential pairs of
28055 // elements, and shrink them to the half-width mask. It does this in a loop
28056 // so it will reduce the size of the mask to the minimal width mask which
28057 // performs an equivalent shuffle.
28058 SmallVector<int, 64> WidenedMask;
28059 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28060 Mask = std::move(WidenedMask);
28061 WidenedMask.clear();
28062 }
28063 // Canonicalization of binary shuffle masks to improve pattern matching by
28064 // commuting the inputs.
28065 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28066 ShuffleVectorSDNode::commuteMask(Mask);
28067 std::swap(Ops[0], Ops[1]);
28068 }
28070 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28071 DCI, Subtarget);
28072 }
28074 /// \brief Get the PSHUF-style mask from PSHUF node.
28076 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28077 /// PSHUF-style masks that can be reused with such instructions.
28078 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28079 MVT VT = N.getSimpleValueType();
28080 SmallVector<int, 4> Mask;
28081 SmallVector<SDValue, 2> Ops;
28084 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28088 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28089 // matter. Check that the upper masks are repeats and remove them.
28090 if (VT.getSizeInBits() > 128) {
28091 int LaneElts = 128 / VT.getScalarSizeInBits();
28093 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28094 for (int j = 0; j < LaneElts; ++j)
28095 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28096 "Mask doesn't repeat in high 128-bit lanes!");
28098 Mask.resize(LaneElts);
28101 switch (N.getOpcode()) {
28102 case X86ISD::PSHUFD:
28103 return Mask;
28104 case X86ISD::PSHUFLW:
28105 Mask.resize(4);
28106 return Mask;
28107 case X86ISD::PSHUFHW:
28108 Mask.erase(Mask.begin(), Mask.begin() + 4);
28109 for (int &M : Mask)
28110 M -= 4;
28111 return Mask;
28112 default:
28113 llvm_unreachable("No valid shuffle instruction found!");
28114 }
28115 }
28117 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28119 /// We walk up the chain and look for a combinable shuffle, skipping over
28120 /// shuffles that we could hoist this shuffle's transformation past without
28121 /// altering anything.
28122 static SDValue
28123 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28124 SelectionDAG &DAG) {
28125 assert(N.getOpcode() == X86ISD::PSHUFD &&
28126 "Called with something other than an x86 128-bit half shuffle!");
28129 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28130 // of the shuffles in the chain so that we can form a fresh chain to replace
28131 // this chain with.
28132 SmallVector<SDValue, 8> Chain;
28133 SDValue V = N.getOperand(0);
28134 for (; V.hasOneUse(); V = V.getOperand(0)) {
28135 switch (V.getOpcode()) {
28136 default:
28137 return SDValue(); // Nothing combined!
28139 case ISD::BITCAST:
28140 // Skip bitcasts as we always know the type for the target specific
28141 // instructions we will generate.
28142 continue;
28144 case X86ISD::PSHUFD:
28145 // Found another dword shuffle.
28146 break;
28148 case X86ISD::PSHUFLW:
28149 // Check that the low words (being shuffled) are the identity in the
28150 // dword shuffle, and the high words are self-contained.
28151 if (Mask[0] != 0 || Mask[1] != 1 ||
28152 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28155 Chain.push_back(V);
28158 case X86ISD::PSHUFHW:
28159 // Check that the high words (being shuffled) are the identity in the
28160 // dword shuffle, and the low words are self-contained.
28161 if (Mask[2] != 2 || Mask[3] != 3 ||
28162 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28165 Chain.push_back(V);
28168 case X86ISD::UNPCKL:
28169 case X86ISD::UNPCKH:
28170 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28171 // shuffle into a preceding word shuffle.
28172 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28173 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28176 // Search for a half-shuffle which we can combine with.
28177 unsigned CombineOp =
28178 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28179 if (V.getOperand(0) != V.getOperand(1) ||
28180 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28182 Chain.push_back(V);
28183 V = V.getOperand(0);
28185 switch (V.getOpcode()) {
28187 return SDValue(); // Nothing to combine.
28189 case X86ISD::PSHUFLW:
28190 case X86ISD::PSHUFHW:
28191 if (V.getOpcode() == CombineOp)
28194 Chain.push_back(V);
28198 V = V.getOperand(0);
28202 } while (V.hasOneUse());
28205 // Break out of the loop if we break out of the switch.
28206 break;
28207 }
28209 if (!V.hasOneUse())
28210 // We fell out of the loop without finding a viable combining instruction.
28211 return SDValue();
28213 // Merge this node's mask and our incoming mask.
28214 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28215 for (int &M : Mask)
28216 M = VMask[M];
28217 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28218 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28220 // Rebuild the chain around this new shuffle.
28221 while (!Chain.empty()) {
28222 SDValue W = Chain.pop_back_val();
28224 if (V.getValueType() != W.getOperand(0).getValueType())
28225 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28227 switch (W.getOpcode()) {
28229 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28231 case X86ISD::UNPCKL:
28232 case X86ISD::UNPCKH:
28233 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28236 case X86ISD::PSHUFD:
28237 case X86ISD::PSHUFLW:
28238 case X86ISD::PSHUFHW:
28239 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28243 if (V.getValueType() != N.getValueType())
28244 V = DAG.getBitcast(N.getValueType(), V);
28246 // Return the new chain to replace N.
28247 return V;
28248 }
28250 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28253 /// We walk up the chain, skipping shuffles of the other half and looking
28254 /// through shuffles which switch halves trying to find a shuffle of the same
28255 /// pair of dwords.
28256 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28258 TargetLowering::DAGCombinerInfo &DCI) {
28260 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28261 "Called with something other than an x86 128-bit half shuffle!");
28263 unsigned CombineOpcode = N.getOpcode();
28265 // Walk up a single-use chain looking for a combinable shuffle.
28266 SDValue V = N.getOperand(0);
28267 for (; V.hasOneUse(); V = V.getOperand(0)) {
28268 switch (V.getOpcode()) {
28270 return false; // Nothing combined!
28273 // Skip bitcasts as we always know the type for the target specific
28277 case X86ISD::PSHUFLW:
28278 case X86ISD::PSHUFHW:
28279 if (V.getOpcode() == CombineOpcode)
28282 // Other-half shuffles are no-ops.
28285 // Break out of the loop if we break out of the switch.
28289 if (!V.hasOneUse())
28290 // We fell out of the loop without finding a viable combining instruction.
28291 return false;
28293 // Combine away the bottom node as its shuffle will be accumulated into
28294 // a preceding shuffle.
28295 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28297 // Record the old value.
28298 SDValue Old = V;
28300 // Merge this node's mask and our incoming mask (adjusted to account for all
28301 // the pshufd instructions encountered).
28302 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28303 for (int &M : Mask)
28304 M = VMask[M];
28305 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28306 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28308 // Check that the shuffles didn't cancel each other out. If not, we need to
28309 // combine to the new one.
28310 if (Old != V)
28311 // Replace the combinable shuffle with the combined one, updating all users
28312 // so that we re-evaluate the chain here.
28313 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28315 return true;
28316 }
28318 /// \brief Try to combine x86 target specific shuffles.
28319 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28320 TargetLowering::DAGCombinerInfo &DCI,
28321 const X86Subtarget &Subtarget) {
28322 SDLoc DL(N);
28323 MVT VT = N.getSimpleValueType();
28324 SmallVector<int, 4> Mask;
28326 unsigned Opcode = N.getOpcode();
28327 switch (Opcode) {
28328 case X86ISD::PSHUFD:
28329 case X86ISD::PSHUFLW:
28330 case X86ISD::PSHUFHW:
28331 Mask = getPSHUFShuffleMask(N);
28332 assert(Mask.size() == 4);
28333 break;
28334 case X86ISD::UNPCKL: {
28335 auto Op0 = N.getOperand(0);
28336 auto Op1 = N.getOperand(1);
28337 unsigned Opcode0 = Op0.getOpcode();
28338 unsigned Opcode1 = Op1.getOpcode();
28340 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28341 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28342 // TODO: Add other horizontal operations as required.
28343 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28344 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28346 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28347 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28348 // moves upper half elements into the lower half part. For example:
28350 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28352 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28354 // will be combined to:
28356 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28358 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28359 // happen due to advanced instructions.
28360 if (!VT.is128BitVector())
28361 return SDValue();
28363 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28364 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28366 unsigned NumElts = VT.getVectorNumElements();
28367 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28368 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28369 NumElts / 2);
28371 auto ShufOp = Op1.getOperand(0);
28372 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28373 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28377 case X86ISD::BLENDI: {
28378 SDValue V0 = N->getOperand(0);
28379 SDValue V1 = N->getOperand(1);
28380 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28381 "Unexpected input vector types");
28383 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28384 // operands and changing the mask to 1. This saves us a bunch of
28385 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28386 // x86InstrInfo knows how to commute this back after instruction selection
28387 // if it would help register allocation.
28389 // TODO: If optimizing for size or a processor that doesn't suffer from
28390 // partial register update stalls, this should be transformed into a MOVSD
28391 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
28393 if (VT == MVT::v2f64)
28394 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28395 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28396 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28397 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28398 }
28400 return SDValue();
28401 }
28402 case X86ISD::MOVSD:
28403 case X86ISD::MOVSS: {
28404 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28405 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28406 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28407 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28408 if (isZero0 && isZero1)
28409 return SDValue();
28411 // We often lower to MOVSD/MOVSS from integer as well as native float
28412 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28413 // easier to combine shuffles later on. We've already accounted for the
28414 // domain switching cost when we decided to lower with it.
28415 bool isFloat = VT.isFloatingPoint();
28416 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28417 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28418 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28419 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28420 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28421 V0 = DAG.getBitcast(NewVT, V0);
28422 V1 = DAG.getBitcast(NewVT, V1);
28423 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28428 case X86ISD::INSERTPS: {
28429 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28430 SDValue Op0 = N.getOperand(0);
28431 SDValue Op1 = N.getOperand(1);
28432 SDValue Op2 = N.getOperand(2);
28433 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28434 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28435 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28436 unsigned ZeroMask = InsertPSMask & 0xF;
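// INSERTPS immediate layout: bits[7:6] select the source element, bits[5:4]
// the destination slot and bits[3:0] the zero mask.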
28438 // If we zero out all elements from Op0 then we don't need to reference it.
28439 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28440 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28441 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28443 // If we zero out the element from Op1 then we don't need to reference it.
28444 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28445 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28446 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28448 // Attempt to merge insertps Op1 with an inner target shuffle node.
28449 SmallVector<int, 8> TargetMask1;
28450 SmallVector<SDValue, 2> Ops1;
28451 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28452 int M = TargetMask1[SrcIdx];
28453 if (isUndefOrZero(M)) {
28454 // Zero/UNDEF insertion - zero out element and remove dependency.
28455 InsertPSMask |= (1u << DstIdx);
28456 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28457 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28459 // Update insertps mask srcidx and reference the source input directly.
28460 assert(0 <= M && M < 8 && "Shuffle index out of range");
28461 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28462 Op1 = Ops1[M < 4 ? 0 : 1];
28463 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28464 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28467 // Attempt to merge insertps Op0 with an inner target shuffle node.
28468 SmallVector<int, 8> TargetMask0;
28469 SmallVector<SDValue, 2> Ops0;
28470 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28473 bool Updated = false;
28474 bool UseInput00 = false;
28475 bool UseInput01 = false;
28476 for (int i = 0; i != 4; ++i) {
28477 int M = TargetMask0[i];
28478 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28479 // No change if element is already zero or the inserted element.
28480 continue;
28481 } else if (isUndefOrZero(M)) {
28482 // If the target mask is undef/zero then we must zero the element.
28483 InsertPSMask |= (1u << i);
28484 Updated = true;
28485 continue;
28486 }
28488 // The input vector element must be inline.
28489 if (M != i && M != (i + 4))
28490 return SDValue();
28492 // Determine which inputs of the target shuffle we're using.
28493 UseInput00 |= (0 <= M && M < 4);
28494 UseInput01 |= (4 <= M);
28497 // If we're not using both inputs of the target shuffle then use the
28498 // referenced input directly.
28499 if (UseInput00 && !UseInput01) {
28502 } else if (!UseInput00 && UseInput01) {
28508 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28509 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28517 // Nuke no-op shuffles that show up after combining.
28518 if (isNoopShuffleMask(Mask))
28519 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28521 // Look for simplifications involving one or two shuffle instructions.
28522 SDValue V = N.getOperand(0);
28523 switch (N.getOpcode()) {
28526 case X86ISD::PSHUFLW:
28527 case X86ISD::PSHUFHW:
28528 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28530 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28531 return SDValue(); // We combined away this shuffle, so we're done.
28533 // See if this reduces to a PSHUFD which is no more expensive and can
28534 // combine with more operations. Note that it has to at least flip the
28535 // dwords as otherwise it would have been removed as a no-op.
28536 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28537 int DMask[] = {0, 1, 2, 3};
28538 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28539 DMask[DOffset + 0] = DOffset + 1;
28540 DMask[DOffset + 1] = DOffset + 0;
28541 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28542 V = DAG.getBitcast(DVT, V);
28543 DCI.AddToWorklist(V.getNode());
28544 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28545 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28546 DCI.AddToWorklist(V.getNode());
28547 return DAG.getBitcast(VT, V);
28550 // Look for shuffle patterns which can be implemented as a single unpack.
28551 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28552 // only works when we have a PSHUFD followed by two half-shuffles.
28553 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28554 (V.getOpcode() == X86ISD::PSHUFLW ||
28555 V.getOpcode() == X86ISD::PSHUFHW) &&
28556 V.getOpcode() != N.getOpcode() &&
28558 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28559 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28560 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28561 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28562 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28563 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28565 for (int i = 0; i < 4; ++i) {
28566 WordMask[i + NOffset] = Mask[i] + NOffset;
28567 WordMask[i + VOffset] = VMask[i] + VOffset;
28569 // Map the word mask through the DWord mask.
28571 for (int i = 0; i < 8; ++i)
28572 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28573 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28574 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28575 // We can replace all three shuffles with an unpack.
28576 V = DAG.getBitcast(VT, D.getOperand(0));
28577 DCI.AddToWorklist(V.getNode());
28578 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28587 case X86ISD::PSHUFD:
28588 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28597 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28598 /// operation. If true is returned then the operands of ADDSUB operation
28599 /// are written to the parameters \p Opnd0 and \p Opnd1.
28601 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28602 /// so it is easier to generically match. We also insert dummy vector shuffle
28603 /// nodes for the operands which explicitly discard the lanes which are unused
28604 /// by this operation, so that the fact that they're unused flows through the
28605 /// rest of the combiner.
28606 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28607 SDValue &Opnd0, SDValue &Opnd1) {
28609 EVT VT = N->getValueType(0);
28610 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28611 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28612 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28615 // We only handle target-independent shuffles.
28616 // FIXME: It would be easy and harmless to use the target shuffle mask
28617 // extraction tool to support more.
28618 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28619 return false;
28621 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28622 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28624 SDValue V1 = N->getOperand(0);
28625 SDValue V2 = N->getOperand(1);
28627 // We require the first shuffle operand to be the FSUB node, and the second to
28628 // be the FADD node.
28629 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28630 ShuffleVectorSDNode::commuteMask(Mask);
28631 std::swap(V1, V2);
28632 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28635 // If there are other uses of these operations we can't fold them.
28636 if (!V1->hasOneUse() || !V2->hasOneUse())
28639 // Ensure that both operations have the same operands. Note that we can
28640 // commute the FADD operands.
28641 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28642 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28643 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28646 // We're looking for blends between FADD and FSUB nodes. We insist on these
28647 // nodes being lined up in a specific expected pattern.
28648 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28649 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28650 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28651 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28652 8, 25, 10, 27, 12, 29, 14, 31})))
28660 /// \brief Try to combine a shuffle into a target-specific add-sub or
28661 /// mul-add-sub node.
28662 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28663 const X86Subtarget &Subtarget,
28664 SelectionDAG &DAG) {
28665 SDValue Opnd0, Opnd1;
28666 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28669 EVT VT = N->getValueType(0);
28672 // Try to generate X86ISD::FMADDSUB node here.
28674 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28675 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28677 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28678 // the ADDSUB idiom has been successfully recognized. There are no known
28679 // X86 targets with 512-bit ADDSUB instructions!
28680 if (VT.is512BitVector())
28683 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28686 // We are looking for a shuffle where both sources are concatenated with undef
28687 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28688 // if we can express this as a single-source shuffle, that's preferable.
28689 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28690 const X86Subtarget &Subtarget) {
28691 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28694 EVT VT = N->getValueType(0);
28696 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28697 if (!VT.is128BitVector() && !VT.is256BitVector())
28700 if (VT.getVectorElementType() != MVT::i32 &&
28701 VT.getVectorElementType() != MVT::i64 &&
28702 VT.getVectorElementType() != MVT::f32 &&
28703 VT.getVectorElementType() != MVT::f64)
28706 SDValue N0 = N->getOperand(0);
28707 SDValue N1 = N->getOperand(1);
28709 // Check that both sources are concats with undef.
28710 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28711 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28712 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28713 !N1.getOperand(1).isUndef())
28716 // Construct the new shuffle mask. Elements from the first source retain their
28717 // index, but elements from the second source no longer need to skip an undef.
28718 SmallVector<int, 8> Mask;
28719 int NumElts = VT.getVectorNumElements();
28721 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28722 for (int Elt : SVOp->getMask())
28723 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
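// e.g. with NumElts == 8, an element taken from the second source at shuffle
// index 9 (its element 1) becomes index 5 of the single concatenated vector.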
28726 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
28728 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28731 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28732 TargetLowering::DAGCombinerInfo &DCI,
28733 const X86Subtarget &Subtarget) {
28735 EVT VT = N->getValueType(0);
28736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28737 // If we have legalized the vector types, look for blends of FADD and FSUB
28738 // nodes that we can fuse into an ADDSUB node.
28739 if (TLI.isTypeLegal(VT))
28740 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28741 return AddSub;
28743 // During Type Legalization, when promoting illegal vector types,
28744 // the backend might introduce new shuffle dag nodes and bitcasts.
28746 // This code performs the following transformation:
28747 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28748 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28750 // We do this only if both the bitcast and the BINOP dag nodes have
28751 // one use. Also, perform this transformation only if the new binary
28752 // operation is legal. This is to avoid introducing dag nodes that
28753 // potentially need to be further expanded (or custom lowered) into a
28754 // less optimal sequence of dag nodes.
28755 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28756 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28757 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28758 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28759 SDValue N0 = N->getOperand(0);
28760 SDValue N1 = N->getOperand(1);
28762 SDValue BC0 = N0.getOperand(0);
28763 EVT SVT = BC0.getValueType();
28764 unsigned Opcode = BC0.getOpcode();
28765 unsigned NumElts = VT.getVectorNumElements();
28767 if (BC0.hasOneUse() && SVT.isVector() &&
28768 SVT.getVectorNumElements() * 2 == NumElts &&
28769 TLI.isOperationLegal(Opcode, VT)) {
28770 bool CanFold = false;
28776 // isOperationLegal lies for integer ops on floating point types.
28777 CanFold = VT.isInteger();
28782 // isOperationLegal lies for floating point ops on integer types.
28783 CanFold = VT.isFloatingPoint();
28787 unsigned SVTNumElts = SVT.getVectorNumElements();
28788 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28789 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28790 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28791 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28792 CanFold = SVOp->getMaskElt(i) < 0;
28795 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28796 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28797 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28798 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28803 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28804 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28805 // consecutive, non-overlapping, and in the right order.
28806 SmallVector<SDValue, 16> Elts;
28807 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
28808 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
28809 Elts.push_back(Elt);
28816 if (Elts.size() == VT.getVectorNumElements())
28817 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28820 // For AVX2, we sometimes want to combine
28821 // (vector_shuffle <mask> (concat_vectors t1, undef)
28822 // (concat_vectors t2, undef))
28824 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28825 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28826 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28829 if (isTargetShuffle(N->getOpcode())) {
28830 SDValue Op(N, 0);
28831 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28834 // Try recursively combining arbitrary sequences of x86 shuffle
28835 // instructions into higher-order shuffles. We do this after combining
28836 // specific PSHUF instruction sequences into their minimal form so that we
28837 // can evaluate how many specialized shuffle instructions are involved in
28838 // a particular chain.
28839 SmallVector<int, 1> NonceMask; // Just a placeholder.
28840 NonceMask.push_back(0);
28841 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
28842 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28843 DCI, Subtarget))
28844 return SDValue(); // This routine will use CombineTo to replace N.
28845 }
28847 return SDValue();
28848 }
28850 /// Check if a vector extract from a target-specific shuffle of a load can be
28851 /// folded into a single element load.
28852 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28853 /// shuffles have been custom lowered so we need to handle those here.
28854 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28855 TargetLowering::DAGCombinerInfo &DCI) {
28856 if (DCI.isBeforeLegalizeOps())
28859 SDValue InVec = N->getOperand(0);
28860 SDValue EltNo = N->getOperand(1);
28861 EVT EltVT = N->getValueType(0);
28863 if (!isa<ConstantSDNode>(EltNo))
28866 EVT OriginalVT = InVec.getValueType();
28868 // Peek through bitcasts, don't duplicate a load with other uses.
28869 InVec = peekThroughOneUseBitcasts(InVec);
28871 EVT CurrentVT = InVec.getValueType();
28872 if (!CurrentVT.isVector() ||
28873 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28876 if (!isTargetShuffle(InVec.getOpcode()))
28879 // Don't duplicate a load with other uses.
28880 if (!InVec.hasOneUse())
28883 SmallVector<int, 16> ShuffleMask;
28884 SmallVector<SDValue, 2> ShuffleOps;
28885 bool UnaryShuffle;
28886 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28887 ShuffleOps, ShuffleMask, UnaryShuffle))
28890 // Select the input vector, guarding against out of range extract vector.
28891 unsigned NumElems = CurrentVT.getVectorNumElements();
28892 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28893 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28895 if (Idx == SM_SentinelZero)
28896 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28897 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28898 if (Idx == SM_SentinelUndef)
28899 return DAG.getUNDEF(EltVT);
28901 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28902 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28903 : ShuffleOps[1];
28905 // If inputs to shuffle are the same for both ops, then allow 2 uses
28906 unsigned AllowedUses =
28907 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28909 if (LdNode.getOpcode() == ISD::BITCAST) {
28910 // Don't duplicate a load with other uses.
28911 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28914 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28915 LdNode = LdNode.getOperand(0);
28918 if (!ISD::isNormalLoad(LdNode.getNode()))
28921 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28923 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28924 return SDValue();
28926 // If there's a bitcast before the shuffle, check if the load type and
28927 // alignment is valid.
28928 unsigned Align = LN0->getAlignment();
28929 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28930 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28931 EltVT.getTypeForEVT(*DAG.getContext()));
28933 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28936 // All checks match so transform back to vector_shuffle so that DAG combiner
28937 // can finish the job
28940 // Create shuffle node taking into account the case that it's a unary shuffle.
28941 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28942 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28944 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28945 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28949 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28950 const X86Subtarget &Subtarget) {
28951 SDValue N0 = N->getOperand(0);
28952 EVT VT = N->getValueType(0);
28953 EVT SrcVT = N0.getValueType();
28955 // Since MMX types are special and don't usually play with other vector types,
28956 // it's better to handle them early to be sure we emit efficient code by
28957 // avoiding store-load conversions.
28959 // Detect bitcasts between i32 to x86mmx low word.
28960 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28961 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
28962 SDValue N00 = N0->getOperand(0);
28963 if (N00.getValueType() == MVT::i32)
28964 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28967 // Detect bitcasts between element or subvector extraction to x86mmx.
28968 if (VT == MVT::x86mmx &&
28969 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
28970 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
28971 isNullConstant(N0.getOperand(1))) {
28972 SDValue N00 = N0->getOperand(0);
28973 if (N00.getValueType().is128BitVector())
28974 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
28975 DAG.getBitcast(MVT::v2i64, N00));
28978 // Detect bitcasts from FP_TO_SINT to x86mmx.
28979 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
28980 N0.getOpcode() == ISD::FP_TO_SINT) {
28982 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
28983 DAG.getUNDEF(MVT::v2i32));
28984 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
28985 DAG.getBitcast(MVT::v2i64, Res));
28988 // Convert a bitcasted integer logic operation that has one bitcasted
28989 // floating-point operand into a floating-point logic operation. This may
28990 // create a load of a constant, but that is cheaper than materializing the
28991 // constant in an integer register and transferring it to an SSE register or
28992 // transferring the SSE operand to integer register and back.
28993 unsigned FPOpcode;
28994 switch (N0.getOpcode()) {
28995 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28996 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28997 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28998 default: return SDValue();
28999 }
29001 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29002 (Subtarget.hasSSE2() && VT == MVT::f64)))
29005 SDValue LogicOp0 = N0.getOperand(0);
29006 SDValue LogicOp1 = N0.getOperand(1);
29009 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29010 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29011 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29012 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29013 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29014 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29016 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29017 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29018 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29019 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29020 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29021 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29027 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29028 // the elements of a vector.
29029 // Returns the vector that is being reduced on, or SDValue() if a reduction
29030 // was not matched.
29031 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29032 // The pattern must end in an extract from index 0.
29033 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29034 !isNullConstant(Extract->getOperand(1)))
29035 return SDValue();
29037 unsigned Stages =
29038 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29040 SDValue Op = Extract->getOperand(0);
29041 // At each stage, we're looking for something that looks like:
29042 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29043 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29044 // i32 undef, i32 undef, i32 undef, i32 undef>
29045 // %a = binop <8 x i32> %op, %s
29046 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29047 // we expect something like:
29048 // <4,5,6,7,u,u,u,u>
29049 // <2,3,u,u,u,u,u,u>
29050 // <1,u,u,u,u,u,u,u>
29051 for (unsigned i = 0; i < Stages; ++i) {
29052 if (Op.getOpcode() != BinOp)
29053 return SDValue();
29055 ShuffleVectorSDNode *Shuffle =
29056 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29057 if (Shuffle) {
29058 Op = Op.getOperand(1);
29059 } else {
29060 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29061 Op = Op.getOperand(0);
29062 }
29064 // The first operand of the shuffle should be the same as the other operand
29065 // of the binop.
29066 if (!Shuffle || (Shuffle->getOperand(0) != Op))
29067 return SDValue();
29069 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29070 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29071 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29072 return SDValue();
29073 }
29075 return Op;
29076 }
29078 // Given a select, detect the following pattern:
29079 // 1: %2 = zext <N x i8> %0 to <N x i32>
29080 // 2: %3 = zext <N x i8> %1 to <N x i32>
29081 // 3: %4 = sub nsw <N x i32> %2, %3
29082 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29083 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29084 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29085 // This is useful as it is the input into a SAD pattern.
29086 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29087 SDValue &Op1) {
29088 // Check the condition of the select instruction is greater-than.
29089 SDValue SetCC = Select->getOperand(0);
29090 if (SetCC.getOpcode() != ISD::SETCC)
29092 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29093 if (CC != ISD::SETGT && CC != ISD::SETLT)
29096 SDValue SelectOp1 = Select->getOperand(1);
29097 SDValue SelectOp2 = Select->getOperand(2);
29099 // The following instructions assume SelectOp1 is the subtraction operand
29100 // and SelectOp2 is the negation operand.
29101 // In the case of SETLT this is the other way around.
29102 if (CC == ISD::SETLT)
29103 std::swap(SelectOp1, SelectOp2);
29105 // The second operand of the select should be the negation of the first
29106 // operand, which is implemented as 0 - SelectOp1.
29107 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29108 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29109 SelectOp2.getOperand(1) == SelectOp1))
29112 // The first operand of SetCC is the first operand of the select, which is the
29113 // difference between the two input vectors.
29114 if (SetCC.getOperand(0) != SelectOp1)
29115 return false;
29117 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29118 APInt SplatVal;
29119 if ((CC == ISD::SETLT) &&
29120 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29121 SplatVal == 1) ||
29122 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29123 return false;
29125 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29126 if ((CC == ISD::SETGT) &&
29127 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29128 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29129 return false;
29131 // The first operand of the select is the difference between the two input
29133 if (SelectOp1.getOpcode() != ISD::SUB)
29136 Op0 = SelectOp1.getOperand(0);
29137 Op1 = SelectOp1.getOperand(1);
29139 // Check if the operands of the sub are zero-extended from vectors of i8.
29140 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29141 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29142 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29143 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29144 return false;
29146 return true;
29147 }
29149 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29151 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29152 const SDValue &Zext1, const SDLoc &DL) {
29154 // Find the appropriate width for the PSADBW.
29155 EVT InVT = Zext0.getOperand(0).getValueType();
29156 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29158 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29159 // fill in the missing vector elements with 0.
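// For example, two v4i8 inputs are each padded out to v16i8 with zero
// vectors, so the sum of absolute differences produced by PSADBW lands in
// the low i64 element of the v2i64 result.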
29160 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29161 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29162 Ops[0] = Zext0.getOperand(0);
29163 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29164 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29165 Ops[0] = Zext1.getOperand(0);
29166 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29168 // Actually build the SAD
29169 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29170 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29173 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
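// For example, for a v4i32 input %x whose elements are all-ones or all-zeros,
// the reductions below become (roughly):
//   any_of: (MOVMSKPS %x) != 0      all_of: (MOVMSKPS %x) == 0xf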
29174 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29175 SelectionDAG &DAG,
29176 const X86Subtarget &Subtarget) {
29177 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29178 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29181 EVT ExtractVT = Extract->getValueType(0);
29182 unsigned BitWidth = ExtractVT.getSizeInBits();
29183 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29184 ExtractVT != MVT::i8)
29187 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29188 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29189 SDValue Match = matchBinOpReduction(Extract, Op);
29193 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29194 // which we can't support here for now.
29195 if (Match.getScalarValueSizeInBits() != BitWidth)
29196 continue;
29198 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29199 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29200 if (!(MatchSizeInBits == 128 ||
29201 (MatchSizeInBits == 256 &&
29202 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29205 // Don't bother performing this for 2-element vectors.
29206 if (Match.getValueType().getVectorNumElements() <= 2)
29207 continue;
29209 // Check that we are extracting a reduction of all sign bits.
29210 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29211 continue;
29213 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29214 MVT MaskVT;
29215 if (64 == BitWidth || 32 == BitWidth)
29216 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29217 MatchSizeInBits / BitWidth);
29218 else
29219 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29221 APInt CompareBits;
29222 ISD::CondCode CondCode;
29223 if (Op == ISD::OR) {
29224 // any_of -> MOVMSK != 0
29225 CompareBits = APInt::getNullValue(32);
29226 CondCode = ISD::CondCode::SETNE;
29227 } else {
29228 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29229 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29230 CondCode = ISD::CondCode::SETEQ;
29231 }
29233 // Perform the select as i32/i64 and then truncate to avoid partial register
29234 // stalls.
29235 unsigned ResWidth = std::max(BitWidth, 32u);
29236 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29237 SDLoc DL(Extract);
29238 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29239 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29240 SDValue Res = DAG.getBitcast(MaskVT, Match);
29241 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29242 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29243 Ones, Zero, CondCode);
29244 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29245 }
29247 return SDValue();
29248 }
29250 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29251 const X86Subtarget &Subtarget) {
29252 // PSADBW is only supported on SSE2 and up.
29253 if (!Subtarget.hasSSE2())
29256 // Verify the type we're extracting from is any integer type above i16.
29257 EVT VT = Extract->getOperand(0).getValueType();
29258 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29259 return SDValue();
29261 unsigned RegSize = 128;
29262 if (Subtarget.hasBWI())
29263 RegSize = 512;
29264 else if (Subtarget.hasAVX2())
29265 RegSize = 256;
29267 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29268 // TODO: We should be able to handle larger vectors by splitting them before
29269 // feeding them into several SADs, and then reducing over those.
29270 if (RegSize / VT.getVectorNumElements() < 8)
29273 // Match shuffle + add pyramid.
29274 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29276 // The operand is expected to be zero extended from i8
29277 // (verified in detectZextAbsDiff).
29278 // In order to convert to i64 and above, additional any/zero/sign
29279 // extend is expected.
29280 // The zero extend from 32 bit has no mathematical effect on the result.
29281 // Also the sign extend is basically zero extend
29282 // (extends the sign bit which is zero).
29283 // So it is correct to skip the sign/zero extend instruction.
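// (Each element being extended here is an absolute difference of two
// zero-extended i8 values, so it is at most 255 and its sign bit is known
// to be zero.)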
29284 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29285 Root.getOpcode() == ISD::ZERO_EXTEND ||
29286 Root.getOpcode() == ISD::ANY_EXTEND))
29287 Root = Root.getOperand(0);
29289 // If there was a match, we want Root to be a select that is the root of an
29290 // abs-diff pattern.
29291 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29292 return SDValue();
29294 // Check whether we have an abs-diff pattern feeding into the select.
29295 SDValue Zext0, Zext1;
29296 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29297 return SDValue();
29299 // Create the SAD instruction.
29300 SDLoc DL(Extract);
29301 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29303 // If the original vector was wider than 8 elements, sum over the results
29304 // in the SAD vector.
29305 unsigned Stages = Log2_32(VT.getVectorNumElements());
29306 MVT SadVT = SAD.getSimpleValueType();
29307 if (Stages > 3) {
29308 unsigned SadElems = SadVT.getVectorNumElements();
29310 for (unsigned i = Stages - 3; i > 0; --i) {
29311 SmallVector<int, 16> Mask(SadElems, -1);
29312 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29313 Mask[j] = MaskEnd + j;
29315 SDValue Shuffle =
29316 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29317 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29318 }
29319 }
29321 MVT Type = Extract->getSimpleValueType(0);
29322 unsigned TypeSizeInBits = Type.getSizeInBits();
29323 // Return the lowest TypeSizeInBits bits.
29324 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29325 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29326 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29327 Extract->getOperand(1));
29330 // Attempt to peek through a target shuffle and extract the scalar from the
29332 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29333 TargetLowering::DAGCombinerInfo &DCI,
29334 const X86Subtarget &Subtarget) {
29335 if (DCI.isBeforeLegalizeOps())
29338 SDValue Src = N->getOperand(0);
29339 SDValue Idx = N->getOperand(1);
29341 EVT VT = N->getValueType(0);
29342 EVT SrcVT = Src.getValueType();
29343 EVT SrcSVT = SrcVT.getVectorElementType();
29344 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29346 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29347 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29350 // Resolve the target shuffle inputs and mask.
29351 SmallVector<int, 16> Mask;
29352 SmallVector<SDValue, 2> Ops;
29353 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
29356 // Attempt to narrow/widen the shuffle mask to the correct size.
29357 if (Mask.size() != NumSrcElts) {
29358 if ((NumSrcElts % Mask.size()) == 0) {
29359 SmallVector<int, 16> ScaledMask;
29360 int Scale = NumSrcElts / Mask.size();
29361 scaleShuffleMask(Scale, Mask, ScaledMask);
29362 Mask = std::move(ScaledMask);
29363 } else if ((Mask.size() % NumSrcElts) == 0) {
29364 SmallVector<int, 16> WidenedMask;
29365 while (Mask.size() > NumSrcElts &&
29366 canWidenShuffleElements(Mask, WidenedMask))
29367 Mask = std::move(WidenedMask);
29368 // TODO - investigate support for wider shuffle masks with known upper
29369 // undef/zero elements for implicit zero-extension.
29370 }
29371 }
29373 // Check if narrowing/widening failed.
29374 if (Mask.size() != NumSrcElts)
29375 return SDValue();
29377 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29378 SDLoc dl(N);
29380 // If the shuffle source element is undef/zero then we can just accept it.
29381 if (SrcIdx == SM_SentinelUndef)
29382 return DAG.getUNDEF(VT);
29384 if (SrcIdx == SM_SentinelZero)
29385 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29386 : DAG.getConstant(0, dl, VT);
29388 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29389 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29390 SrcIdx = SrcIdx % Mask.size();
29392 // We can only extract other elements from 128-bit vectors and in certain
29393 // circumstances, depending on SSE-level.
29394 // TODO: Investigate using extract_subvector for larger vectors.
29395 // TODO: Investigate float/double extraction if it will be just stored.
29396 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29397 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29398 assert(SrcSVT == VT && "Unexpected extraction type");
29399 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29400 DAG.getIntPtrConstant(SrcIdx, dl));
29403 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29404 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29405 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29406 "Unexpected extraction type");
29407 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29408 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29409 DAG.getIntPtrConstant(SrcIdx, dl));
29410 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29411 DAG.getValueType(SrcSVT));
29412 return DAG.getZExtOrTrunc(Assert, dl, VT);
29413 }
29415 return SDValue();
29416 }
29418 /// Detect vector gather/scatter index generation and convert it from being a
29419 /// bunch of shuffles and extracts into a somewhat faster sequence.
29420 /// For i686, the best sequence is apparently storing the value and loading
29421 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
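/// For example, on x86-64 a v4i32 source whose four extracted elements all
/// feed sign/zero extends is rewritten as two i64 extracts plus shift and
/// truncate pairs instead of four separate element extracts.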
29422 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29423 TargetLowering::DAGCombinerInfo &DCI,
29424 const X86Subtarget &Subtarget) {
29425 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29426 return NewOp;
29428 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29429 return NewOp;
29431 SDValue InputVector = N->getOperand(0);
29432 SDValue EltIdx = N->getOperand(1);
29434 EVT SrcVT = InputVector.getValueType();
29435 EVT VT = N->getValueType(0);
29436 SDLoc dl(InputVector);
29438 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29439 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29440 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29441 SDValue MMXSrc = InputVector.getOperand(0);
29443 // The bitcast source is a direct mmx result.
29444 if (MMXSrc.getValueType() == MVT::x86mmx)
29445 return DAG.getBitcast(VT, InputVector);
29448 // Detect mmx to i32 conversion through a v2i32 elt extract.
29449 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29450 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29451 SDValue MMXSrc = InputVector.getOperand(0);
29453 // The bitcast source is a direct mmx result.
29454 if (MMXSrc.getValueType() == MVT::x86mmx)
29455 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29458 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29459 isa<ConstantSDNode>(EltIdx) &&
29460 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29461 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29462 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29463 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29464 return DAG.getConstant(Res, dl, MVT::i1);
29467 // Check whether this extract is the root of a sum of absolute differences
29468 // pattern. This has to be done here because we really want it to happen
29469 // pre-legalization.
29470 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29471 return SAD;
29473 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29474 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29475 return Cmp;
29477 // Only operate on vectors of 4 elements, where the alternative shuffling
29478 // gets to be more expensive.
29479 if (SrcVT != MVT::v4i32)
29480 return SDValue();
29482 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29483 // single use which is a sign-extend or zero-extend, and all elements are
29484 // used.
29485 SmallVector<SDNode *, 4> Uses;
29486 unsigned ExtractedElements = 0;
29487 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29488 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29489 if (UI.getUse().getResNo() != InputVector.getResNo())
29492 SDNode *Extract = *UI;
29493 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29496 if (Extract->getValueType(0) != MVT::i32)
29498 if (!Extract->hasOneUse())
29500 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29501 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29503 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29506 // Record which element was extracted.
29507 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29508 Uses.push_back(Extract);
29511 // If not all the elements were used, this may not be worthwhile.
29512 if (ExtractedElements != 15)
29515 // Ok, we've now decided to do the transformation.
29516 // If 64-bit shifts are legal, use the extract-shift sequence,
29517 // otherwise bounce the vector off the cache.
29518 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29519 SDValue Vals[4];
29521 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29522 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29523 auto &DL = DAG.getDataLayout();
29524 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29525 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29526 DAG.getConstant(0, dl, VecIdxTy));
29527 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29528 DAG.getConstant(1, dl, VecIdxTy));
29530 SDValue ShAmt = DAG.getConstant(
29531 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29532 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29533 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29534 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29535 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29536 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29537 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29538 } else {
29539 // Store the value to a temporary stack slot.
29540 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29541 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29542 MachinePointerInfo());
29544 EVT ElementType = SrcVT.getVectorElementType();
29545 unsigned EltSize = ElementType.getSizeInBits() / 8;
29547 // Replace each use (extract) with a load of the appropriate element.
29548 for (unsigned i = 0; i < 4; ++i) {
29549 uint64_t Offset = EltSize * i;
29550 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29551 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29553 SDValue ScalarAddr =
29554 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29556 // Load the scalar.
29557 Vals[i] =
29558 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29559 }
29560 }
29562 // Replace the extracts
29563 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29564 UE = Uses.end(); UI != UE; ++UI) {
29565 SDNode *Extract = *UI;
29567 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29568 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29569 }
29571 // The replacement was made in place; don't return anything.
29572 return SDValue();
29573 }
29575 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29576 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29577 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29578 // combineBasicSADPattern.
29579 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29580 TargetLowering::DAGCombinerInfo &DCI,
29581 const X86Subtarget &Subtarget) {
29582 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29585 /// If a vector select has an operand that is -1 or 0, try to simplify the
29586 /// select to a bitwise logic operation.
29587 static SDValue
29588 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29589 TargetLowering::DAGCombinerInfo &DCI,
29590 const X86Subtarget &Subtarget) {
29591 SDValue Cond = N->getOperand(0);
29592 SDValue LHS = N->getOperand(1);
29593 SDValue RHS = N->getOperand(2);
29594 EVT VT = LHS.getValueType();
29595 EVT CondVT = Cond.getValueType();
29596 SDLoc DL(N);
29597 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29599 if (N->getOpcode() != ISD::VSELECT)
29600 return SDValue();
29602 assert(CondVT.isVector() && "Vector select expects a vector selector!");
29604 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29605 // Check if the first operand is all zeros and Cond type is vXi1.
29606 // This situation only applies to avx512.
29607 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
29608 CondVT.getVectorElementType() == MVT::i1) {
29609 // Invert the cond to not(cond) : xor(op,allones)=not(op)
29610 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29611 DAG.getAllOnesConstant(DL, CondVT));
29612 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
29613 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
29616 // To use the condition operand as a bitwise mask, it must have elements that
29617 // are the same size as the select elements. Ie, the condition operand must
29618 // have already been promoted from the IR select condition type <N x i1>.
29619 // Don't check if the types themselves are equal because that excludes
29620 // vector floating-point selects.
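// For example, with a v4i32 condition and v4f32 operands where the false
// value is all zeros, the combine below produces
//   (v4f32 (bitcast (and Cond, (v4i32 (bitcast X)))))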
29621 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
29624 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
29625 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
29627 // Try to invert the condition if true value is not all 1s and false value is
29629 if (!TValIsAllOnes && !FValIsAllZeros &&
29630 // Check if the selector will be produced by CMPP*/PCMP*.
29631 Cond.getOpcode() == ISD::SETCC &&
29632 // Check if SETCC has already been promoted.
29633 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
29634 CondVT) {
29635 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29636 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
29638 if (TValIsAllZeros || FValIsAllOnes) {
29639 SDValue CC = Cond.getOperand(2);
29640 ISD::CondCode NewCC =
29641 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
29642 Cond.getOperand(0).getValueType().isInteger());
29643 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
29644 NewCC);
29645 std::swap(LHS, RHS);
29646 TValIsAllOnes = FValIsAllOnes;
29647 FValIsAllZeros = TValIsAllZeros;
29651 // vselect Cond, 111..., 000... -> Cond
29652 if (TValIsAllOnes && FValIsAllZeros)
29653 return DAG.getBitcast(VT, Cond);
29655 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
29658 // vselect Cond, 111..., X -> or Cond, X
29659 if (TValIsAllOnes) {
29660 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
29661 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
29662 return DAG.getBitcast(VT, Or);
29665 // vselect Cond, X, 000... -> and Cond, X
29666 if (FValIsAllZeros) {
29667 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
29668 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
29669 return DAG.getBitcast(VT, And);
29670 }
29672 return SDValue();
29673 }
29675 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
29676 SDValue Cond = N->getOperand(0);
29677 SDValue LHS = N->getOperand(1);
29678 SDValue RHS = N->getOperand(2);
29679 SDLoc DL(N);
29681 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
29682 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
29683 if (!TrueC || !FalseC)
29686 // Don't do this for crazy integer types.
29687 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
29690 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
29691 // so that TrueC (the true value) is larger than FalseC.
29692 bool NeedsCondInvert = false;
29693 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
29694 // Efficiently invertible.
29695 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
29696 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
29697 isa<ConstantSDNode>(Cond.getOperand(1))))) {
29698 NeedsCondInvert = true;
29699 std::swap(TrueC, FalseC);
29702 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
29703 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29704 if (NeedsCondInvert) // Invert the condition if needed.
29705 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29706 DAG.getConstant(1, DL, Cond.getValueType()));
29708 // Zero extend the condition if needed.
29709 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
29711 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29712 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
29713 DAG.getConstant(ShAmt, DL, MVT::i8));
29716 // Optimize cases that will turn into an LEA instruction. This requires
29717 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
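// For example, (select %c, i32 13, i32 10) has Diff == 3; it is lowered to
// 10 + (zext %c) * 3, which a single LEA can compute as base + cond + cond*2.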
29718 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29719 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
29720 if (N->getValueType(0) == MVT::i32)
29721 Diff = (unsigned)Diff;
29723 bool isFastMultiplier = false;
29724 if (Diff < 10) {
29725 switch ((unsigned char)Diff) {
29726 default: break;
29728 case 1: // result = add base, cond
29729 case 2: // result = lea base( , cond*2)
29730 case 3: // result = lea base(cond, cond*2)
29731 case 4: // result = lea base( , cond*4)
29732 case 5: // result = lea base(cond, cond*4)
29733 case 8: // result = lea base( , cond*8)
29734 case 9: // result = lea base(cond, cond*8)
29735 isFastMultiplier = true;
29736 break;
29737 }
29738 }
29740 if (isFastMultiplier) {
29741 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
29742 if (NeedsCondInvert) // Invert the condition if needed.
29743 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29744 DAG.getConstant(1, DL, Cond.getValueType()));
29746 // Zero extend the condition if needed.
29747 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
29748 // Scale the condition by the difference.
29750 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29751 DAG.getConstant(Diff, DL, Cond.getValueType()));
29753 // Add the base if non-zero.
29754 if (FalseC->getAPIntValue() != 0)
29755 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29756 SDValue(FalseC, 0));
29757 return Cond;
29758 }
29759 }
29761 return SDValue();
29762 }
29764 // If this is a bitcasted op that can be represented as another type, push
29765 // the bitcast to the inputs. This allows more opportunities for pattern
29766 // matching masked instructions. This is called when we know that the operation
29767 // is used as one of the inputs of a vselect.
29768 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
29769 TargetLowering::DAGCombinerInfo &DCI) {
29770 // Make sure we have a bitcast.
29771 if (OrigOp.getOpcode() != ISD::BITCAST)
29774 SDValue Op = OrigOp.getOperand(0);
29776 // If the operation is used by anything other than the bitcast, we shouldn't
29777 // do this combine as that would replicate the operation.
29778 if (!Op.hasOneUse())
29781 MVT VT = OrigOp.getSimpleValueType();
29782 MVT EltVT = VT.getVectorElementType();
29783 SDLoc DL(Op.getNode());
29785 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
29786 SDValue Op2) {
29787 Op0 = DAG.getBitcast(VT, Op0);
29788 DCI.AddToWorklist(Op0.getNode());
29789 Op1 = DAG.getBitcast(VT, Op1);
29790 DCI.AddToWorklist(Op1.getNode());
29791 DCI.CombineTo(OrigOp.getNode(),
29792 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29793 return true;
29794 };
29796 unsigned Opcode = Op.getOpcode();
29797 switch (Opcode) {
29798 case X86ISD::PALIGNR:
29799 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29800 if (!VT.is128BitVector())
29801 return false;
29802 Opcode = X86ISD::VALIGN;
29803 LLVM_FALLTHROUGH;
29804 case X86ISD::VALIGN: {
29805 if (EltVT != MVT::i32 && EltVT != MVT::i64)
29807 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29808 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29809 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29810 unsigned EltSize = EltVT.getSizeInBits();
29811 // Make sure we can represent the same shift with the new VT.
29812 if ((ShiftAmt % EltSize) != 0)
29814 Imm = ShiftAmt / EltSize;
29815 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29816 DAG.getConstant(Imm, DL, MVT::i8));
29817 }
29818 case X86ISD::SHUF128: {
29819 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29820 return false;
29821 // Only change element size, not type.
29822 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29823 return false;
29824 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29825 Op.getOperand(2));
29826 }
29827 case ISD::INSERT_SUBVECTOR: {
29828 unsigned EltSize = EltVT.getSizeInBits();
29829 if (EltSize != 32 && EltSize != 64)
29831 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29832 // Only change element size, not type.
29833 if (EltVT.isInteger() != OpEltVT.isInteger())
29835 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29836 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29837 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29838 DCI.AddToWorklist(Op0.getNode());
29839 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29840 SDValue Op1 = Op.getOperand(1);
29841 MVT Op1VT = MVT::getVectorVT(EltVT,
29842 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29843 Op1 = DAG.getBitcast(Op1VT, Op1);
29844 DCI.AddToWorklist(Op1.getNode());
29845 DCI.CombineTo(OrigOp.getNode(),
29846 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29847 DAG.getIntPtrConstant(Imm, DL)));
29848 return true;
29849 }
29850 case ISD::EXTRACT_SUBVECTOR: {
29851 unsigned EltSize = EltVT.getSizeInBits();
29852 if (EltSize != 32 && EltSize != 64)
29854 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29855 // Only change element size, not type.
29856 if (EltVT.isInteger() != OpEltVT.isInteger())
29858 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29859 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29860 // Op0 needs to be bitcasted to a larger vector with the same element type.
29861 SDValue Op0 = Op.getOperand(0);
29862 MVT Op0VT = MVT::getVectorVT(EltVT,
29863 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29864 Op0 = DAG.getBitcast(Op0VT, Op0);
29865 DCI.AddToWorklist(Op0.getNode());
29866 DCI.CombineTo(OrigOp.getNode(),
29867 DAG.getNode(Opcode, DL, VT, Op0,
29868 DAG.getIntPtrConstant(Imm, DL)));
29869 return true;
29870 }
29871 case X86ISD::SUBV_BROADCAST: {
29872 unsigned EltSize = EltVT.getSizeInBits();
29873 if (EltSize != 32 && EltSize != 64)
29875 // Only change element size, not type.
29876 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29878 SDValue Op0 = Op.getOperand(0);
29879 MVT Op0VT = MVT::getVectorVT(EltVT,
29880 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29881 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
29882 DCI.AddToWorklist(Op0.getNode());
29883 DCI.CombineTo(OrigOp.getNode(),
29884 DAG.getNode(Opcode, DL, VT, Op0));
29885 return true;
29886 }
29887 }
29889 return false;
29890 }
29892 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29893 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29894 TargetLowering::DAGCombinerInfo &DCI,
29895 const X86Subtarget &Subtarget) {
29896 SDLoc DL(N);
29897 SDValue Cond = N->getOperand(0);
29898 // Get the LHS/RHS of the select.
29899 SDValue LHS = N->getOperand(1);
29900 SDValue RHS = N->getOperand(2);
29901 EVT VT = LHS.getValueType();
29902 EVT CondVT = Cond.getValueType();
29903 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29905 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29906 // instructions match the semantics of the common C idiom x<y?x:y but not
29907 // x<=y?x:y, because of how they handle negative zero (which can be
29908 // ignored in unsafe-math mode).
29909 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
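// (For example, with x = -0.0 and y = +0.0: MINSS returns +0.0, matching
// x<y?x:y, which also picks y, but not x<=y?x:y, which would pick x = -0.0.)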
29910 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29911 VT != MVT::f80 && VT != MVT::f128 &&
29912 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29913 (Subtarget.hasSSE2() ||
29914 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29915 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29917 unsigned Opcode = 0;
29918 // Check for x CC y ? x : y.
29919 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29920 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29924 // Converting this to a min would handle NaNs incorrectly, and swapping
29925 // the operands would cause it to handle comparisons between positive
29926 // and negative zero incorrectly.
29927 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29928 if (!DAG.getTarget().Options.UnsafeFPMath &&
29929 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29931 std::swap(LHS, RHS);
29933 Opcode = X86ISD::FMIN;
29936 // Converting this to a min would handle comparisons between positive
29937 // and negative zero incorrectly.
29938 if (!DAG.getTarget().Options.UnsafeFPMath &&
29939 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29941 Opcode = X86ISD::FMIN;
29944 // Converting this to a min would handle both negative zeros and NaNs
29945 // incorrectly, but we can swap the operands to fix both.
29946 std::swap(LHS, RHS);
29950 Opcode = X86ISD::FMIN;
29954 // Converting this to a max would handle comparisons between positive
29955 // and negative zero incorrectly.
29956 if (!DAG.getTarget().Options.UnsafeFPMath &&
29957 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29959 Opcode = X86ISD::FMAX;
29962 // Converting this to a max would handle NaNs incorrectly, and swapping
29963 // the operands would cause it to handle comparisons between positive
29964 // and negative zero incorrectly.
29965 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29966 if (!DAG.getTarget().Options.UnsafeFPMath &&
29967 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29969 std::swap(LHS, RHS);
29971 Opcode = X86ISD::FMAX;
29974 // Converting this to a max would handle both negative zeros and NaNs
29975 // incorrectly, but we can swap the operands to fix both.
29976 std::swap(LHS, RHS);
29980 Opcode = X86ISD::FMAX;
29983 // Check for x CC y ? y : x -- a min/max with reversed arms.
29984 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
29985 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
29989 // Converting this to a min would handle comparisons between positive
29990 // and negative zero incorrectly, and swapping the operands would
29991 // cause it to handle NaNs incorrectly.
29992 if (!DAG.getTarget().Options.UnsafeFPMath &&
29993 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
29994 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29996 std::swap(LHS, RHS);
29998 Opcode = X86ISD::FMIN;
30001 // Converting this to a min would handle NaNs incorrectly.
30002 if (!DAG.getTarget().Options.UnsafeFPMath &&
30003 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30005 Opcode = X86ISD::FMIN;
30008 // Converting this to a min would handle both negative zeros and NaNs
30009 // incorrectly, but we can swap the operands to fix both.
30010 std::swap(LHS, RHS);
30014 Opcode = X86ISD::FMIN;
30018 // Converting this to a max would handle NaNs incorrectly.
30019 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30021 Opcode = X86ISD::FMAX;
30024 // Converting this to a max would handle comparisons between positive
30025 // and negative zero incorrectly, and swapping the operands would
30026 // cause it to handle NaNs incorrectly.
30027 if (!DAG.getTarget().Options.UnsafeFPMath &&
30028 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30029 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30031 std::swap(LHS, RHS);
30033 Opcode = X86ISD::FMAX;
30036 // Converting this to a max would handle both negative zeros and NaNs
30037 // incorrectly, but we can swap the operands to fix both.
30038 std::swap(LHS, RHS);
30042 Opcode = X86ISD::FMAX;
30043 break;
30044 }
30045 }
30047 if (Opcode)
30048 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30049 }
30051 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30052 // lowering on KNL. In this case we convert it to
30053 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30054 // The same situation for all 128 and 256-bit vectors of i8 and i16.
30055 // Since SKX these selects have a proper lowering.
30056 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30057 CondVT.getVectorElementType() == MVT::i1 &&
30058 (VT.is128BitVector() || VT.is256BitVector()) &&
30059 (VT.getVectorElementType() == MVT::i8 ||
30060 VT.getVectorElementType() == MVT::i16) &&
30061 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30062 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30063 DCI.AddToWorklist(Cond.getNode());
30064 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30067 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30070 // Canonicalize max and min:
30071 // (x > y) ? x : y -> (x >= y) ? x : y
30072 // (x < y) ? x : y -> (x <= y) ? x : y
30073 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30074 // the need for an extra compare
30075 // against zero. e.g.
30076 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30077 // subl %esi, %edi
30078 // testl %edi, %edi
30079 // movl $0, %eax
30080 // cmovgl %edi, %eax
30081 // =>
30082 // xorl %eax, %eax
30083 // subl %esi, %edi
30084 // cmovsl %eax, %edi
30085 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30086 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30087 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30088 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30089 switch (CC) {
30090 default: break;
30091 case ISD::SETLT:
30092 case ISD::SETGT: {
30093 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30094 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30095 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30096 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
30097 }
30098 }
30099 }
30101 // Early exit check
30102 if (!TLI.isTypeLegal(VT))
30105 // Match VSELECTs into subs with unsigned saturation.
30106 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30107 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30108 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30109 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30110 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30112 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30113 // left side invert the predicate to simplify logic below.
30114 SDValue Other;
30115 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30116 Other = RHS;
30117 CC = ISD::getSetCCInverse(CC, true);
30118 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30119 Other = LHS;
30120 }
30122 if (Other.getNode() && Other->getNumOperands() == 2 &&
30123 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30124 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30125 SDValue CondRHS = Cond->getOperand(1);
30127 // Look for a general sub with unsigned saturation first.
30128 // x >= y ? x-y : 0 --> subus x, y
30129 // x > y ? x-y : 0 --> subus x, y
30130 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30131 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30132 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30134 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30135 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30136 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30137 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30138 // If the RHS is a constant we have to reverse the const
30139 // canonicalization.
30140 // x > C-1 ? x+-C : 0 --> subus x, C
30141 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30142 CondRHSConst->getAPIntValue() ==
30143 (-OpRHSConst->getAPIntValue() - 1))
30144 return DAG.getNode(
30145 X86ISD::SUBUS, DL, VT, OpLHS,
30146 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30148 // Another special case: If C was a sign bit, the sub has been
30149 // canonicalized into a xor.
30150 // FIXME: Would it be better to use computeKnownBits to determine
30151 // whether it's safe to decanonicalize the xor?
30152 // x s< 0 ? x^C : 0 --> subus x, C
30153 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30154 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30155 OpRHSConst->getAPIntValue().isSignMask())
30156 // Note that we have to rebuild the RHS constant here to ensure we
30157 // don't rely on particular values of undef lanes.
30158 return DAG.getNode(
30159 X86ISD::SUBUS, DL, VT, OpLHS,
30160 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30161 }
30162 }
30163 }
30165 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30166 return V;
30168 // If this is a *dynamic* select (non-constant condition) and we can match
30169 // this node with one of the variable blend instructions, restructure the
30170 // condition so that blends can use the high (sign) bit of each element and
30171 // use SimplifyDemandedBits to simplify the condition operand.
30172 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30173 !DCI.isBeforeLegalize() &&
30174 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30175 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30177 // Don't optimize vector selects that map to mask-registers.
30178 if (BitWidth == 1)
30179 return SDValue();
30181 // We can only handle the cases where VSELECT is directly legal on the
30182 // subtarget. We custom lower VSELECT nodes with constant conditions and
30183 // this makes it hard to see whether a dynamic VSELECT will correctly
30184 // lower, so we both check the operation's status and explicitly handle the
30185 // cases where a *dynamic* blend will fail even though a constant-condition
30186 // blend could be custom lowered.
30187 // FIXME: We should find a better way to handle this class of problems.
30188 // Potentially, we should combine constant-condition vselect nodes
30189 // pre-legalization into shuffles and not mark as many types as custom
30191 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30193 // FIXME: We don't support i16-element blends currently. We could and
30194 // should support them by making *all* the bits in the condition be set
30195 // rather than just the high bit and using an i8-element blend.
30196 if (VT.getVectorElementType() == MVT::i16)
30198 // Dynamic blending was only available from SSE4.1 onward.
30199 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30201 // Byte blends are only available in AVX2
30202 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30205 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30206 APInt DemandedMask(APInt::getSignMask(BitWidth));
30207 APInt KnownZero, KnownOne;
30208 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
30209 DCI.isBeforeLegalizeOps());
30210 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
30211 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
30212 TLO)) {
30213 // If we changed the computation somewhere in the DAG, this change will
30214 // affect all users of Cond. Make sure it is fine and update all the nodes
30215 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30216 // perform wrong optimizations as we messed with the actual expectation
30217 // for the vector boolean values.
30218 if (Cond != TLO.Old) {
30219 // Check all uses of the condition operand to check whether it will be
30220 // consumed by non-BLEND instructions. Those may require that all bits
30221 // are set properly.
30222 for (SDNode *U : Cond->uses()) {
30223 // TODO: Add other opcodes eventually lowered into BLEND.
30224 if (U->getOpcode() != ISD::VSELECT)
30228 // Update all users of the condition before committing the change, so
30229 // that the VSELECT optimizations that expect the correct vector boolean
30230 // value will not be triggered.
30231 for (SDNode *U : Cond->uses()) {
30232 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30233 U->getValueType(0), Cond, U->getOperand(1),
30234 U->getOperand(2));
30235 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30236 }
30237 DCI.CommitTargetLoweringOpt(TLO);
30238 return SDValue(N, 0);
30239 }
30240 // Only Cond (rather than other nodes in the computation chain) was
30241 // changed. Change the condition just for N to keep the opportunity to
30242 // optimize all other users their own way.
30243 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30244 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30245 return SDValue(N, 0);
30246 }
30247 }
30249 // Look for vselects with LHS/RHS being bitcasted from an operation that
30250 // can be executed on another type. Push the bitcast to the inputs of
30251 // the operation. This exposes opportunities for using masking instructions.
30252 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30253 CondVT.getVectorElementType() == MVT::i1) {
30254 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30255 return SDValue(N, 0);
30256 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30257 return SDValue(N, 0);
30258 }
30260 return SDValue();
30261 }
30263 /// Combine:
30264 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30265 /// to:
30266 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30267 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30268 /// Note that this is only legal for some op/cc combinations.
30269 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30270 SelectionDAG &DAG) {
30271 // This combine only operates on CMP-like nodes.
30272 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30273 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30276 // Can't replace the cmp if it has more uses than the one we're looking at.
30277 // FIXME: We would like to be able to handle this, but would need to make sure
30278 // all uses were updated.
30279 if (!Cmp.hasOneUse())
30282 // This only applies to variations of the common case:
30283 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30284 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30285 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30286 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30287 // Using the proper condcodes (see below), overflow is checked for.
30289 // FIXME: We can generalize both constraints:
30290 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30292 // if the result is compared.
30294 SDValue CmpLHS = Cmp.getOperand(0);
30295 SDValue CmpRHS = Cmp.getOperand(1);
30297 if (!CmpLHS.hasOneUse())
30300 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30301 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30304 const unsigned Opc = CmpLHS.getOpcode();
30306 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30307 return SDValue();
30309 SDValue OpRHS = CmpLHS.getOperand(2);
30310 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30311 if (!OpRHSC)
30312 return SDValue();
30314 APInt Addend = OpRHSC->getAPIntValue();
30315 if (Opc == ISD::ATOMIC_LOAD_SUB)
30316 Addend = -Addend;
30318 if (CC == X86::COND_S && Addend == 1)
30319 CC = X86::COND_LE;
30320 else if (CC == X86::COND_NS && Addend == 1)
30321 CC = X86::COND_G;
30322 else if (CC == X86::COND_G && Addend == -1)
30323 CC = X86::COND_NS;
30324 else if (CC == X86::COND_LE && Addend == -1)
30325 CC = X86::COND_S;
30326 else
30327 return SDValue();
30329 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30330 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30331 DAG.getUNDEF(CmpLHS.getValueType()));
30332 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30333 return LockOp;
30334 }
30336 // Check whether a boolean test is testing a boolean value generated by
30337 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
30340 // Simplify the following patterns:
30341 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30342 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30343 // to (Op EFLAGS Cond)
30345 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30346 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30347 // to (Op EFLAGS !Cond)
30349 // where Op could be BRCOND or CMOV.
30351 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30352 // This combine only operates on CMP-like nodes.
30353 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30354 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30357 // Quit if not used as a boolean value.
30358 if (CC != X86::COND_E && CC != X86::COND_NE)
30361 // Check CMP operands. One of them should be 0 or 1 and the other should be
30362 // an SetCC or extended from it.
30363 SDValue Op1 = Cmp.getOperand(0);
30364 SDValue Op2 = Cmp.getOperand(1);
30366 SDValue SetCC;
30367 const ConstantSDNode* C = nullptr;
30368 bool needOppositeCond = (CC == X86::COND_E);
30369 bool checkAgainstTrue = false; // Is it a comparison against 1?
30371 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30372 SetCC = Op2;
30373 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30374 SetCC = Op1;
30375 else // Quit if all operands are not constants.
30376 return SDValue();
30378 if (C->getZExtValue() == 1) {
30379 needOppositeCond = !needOppositeCond;
30380 checkAgainstTrue = true;
30381 } else if (C->getZExtValue() != 0)
30382 // Quit if the constant is neither 0 or 1.
30385 bool truncatedToBoolWithAnd = false;
30386 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30387 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30388 SetCC.getOpcode() == ISD::TRUNCATE ||
30389 SetCC.getOpcode() == ISD::AND) {
30390 if (SetCC.getOpcode() == ISD::AND) {
30391 int OpIdx = -1;
30392 if (isOneConstant(SetCC.getOperand(0)))
30393 OpIdx = 1;
30394 if (isOneConstant(SetCC.getOperand(1)))
30395 OpIdx = 0;
30396 if (OpIdx < 0)
30397 break;
30398 SetCC = SetCC.getOperand(OpIdx);
30399 truncatedToBoolWithAnd = true;
30400 } else
30401 SetCC = SetCC.getOperand(0);
30402 }
30404 switch (SetCC.getOpcode()) {
30405 case X86ISD::SETCC_CARRY:
30406 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30407 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30408 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30409 // truncated to i1 using 'and'.
30410 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30411 break;
30412 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30413 "Invalid use of SETCC_CARRY!");
30414 LLVM_FALLTHROUGH;
30415 case X86ISD::SETCC:
30416 // Set the condition code or opposite one if necessary.
30417 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30418 if (needOppositeCond)
30419 CC = X86::GetOppositeBranchCondition(CC);
30420 return SetCC.getOperand(1);
30421 case X86ISD::CMOV: {
30422 // Check whether false/true value has canonical one, i.e. 0 or 1.
30423 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30424 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30425 // Quit if true value is not a constant.
30426 if (!TVal)
30427 return SDValue();
30428 // Quit if false value is not a constant.
30429 if (!FVal) {
30430 SDValue Op = SetCC.getOperand(0);
30431 // Skip 'zext' or 'trunc' node.
30432 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30433 Op.getOpcode() == ISD::TRUNCATE)
30434 Op = Op.getOperand(0);
30435 // A special case for rdrand/rdseed, where 0 is set if false cond is
30436 // found.
30437 if ((Op.getOpcode() != X86ISD::RDRAND &&
30438 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30439 return SDValue();
30440 }
30441 // Quit if false value is not the constant 0 or 1.
30442 bool FValIsFalse = true;
30443 if (FVal && FVal->getZExtValue() != 0) {
30444 if (FVal->getZExtValue() != 1)
30446 // If FVal is 1, opposite cond is needed.
30447 needOppositeCond = !needOppositeCond;
30448 FValIsFalse = false;
30449 }
30450 // Quit if TVal is not the constant opposite of FVal.
30451 if (FValIsFalse && TVal->getZExtValue() != 1)
30452 return SDValue();
30453 if (!FValIsFalse && TVal->getZExtValue() != 0)
30454 return SDValue();
30455 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30456 if (needOppositeCond)
30457 CC = X86::GetOppositeBranchCondition(CC);
30458 return SetCC.getOperand(3);
30459 }
30460 }
30462 return SDValue();
30463 }
30465 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30467 /// (X86or (X86setcc) (X86setcc))
30468 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30469 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30470 X86::CondCode &CC1, SDValue &Flags,
30471 bool &isAnd) {
30472 if (Cond->getOpcode() == X86ISD::CMP) {
30473 if (!isNullConstant(Cond->getOperand(1)))
30474 return false;
30476 Cond = Cond->getOperand(0);
30477 }
30479 isAnd = false;
30481 SDValue SetCC0, SetCC1;
30482 switch (Cond->getOpcode()) {
30483 default: return false;
30484 case ISD::AND:
30485 case X86ISD::AND:
30486 isAnd = true;
30487 LLVM_FALLTHROUGH;
30488 case ISD::OR:
30489 case X86ISD::OR:
30490 SetCC0 = Cond->getOperand(0);
30491 SetCC1 = Cond->getOperand(1);
30492 break;
30493 }
30495 // Make sure we have SETCC nodes, using the same flags value.
30496 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30497 SetCC1.getOpcode() != X86ISD::SETCC ||
30498 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30501 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30502 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30503 Flags = SetCC0->getOperand(1);
30504 return true;
30505 }
30507 /// Optimize an EFLAGS definition used according to the condition code \p CC
30508 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30509 /// uses of chain values.
30510 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30511 SelectionDAG &DAG) {
30512 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30513 return R;
30514 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30515 }
30517 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30518 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30519 TargetLowering::DAGCombinerInfo &DCI,
30520 const X86Subtarget &Subtarget) {
30521 SDLoc DL(N);
30523 // If the flag operand isn't dead, don't touch this CMOV.
30524 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30527 SDValue FalseOp = N->getOperand(0);
30528 SDValue TrueOp = N->getOperand(1);
30529 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30530 SDValue Cond = N->getOperand(3);
30532 if (CC == X86::COND_E || CC == X86::COND_NE) {
30533 switch (Cond.getOpcode()) {
30534 default: break;
30535 case X86ISD::BSR:
30536 case X86ISD::BSF:
30537 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30538 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30539 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30543 // Try to simplify the EFLAGS and condition code operands.
30544 // We can't always do this as FCMOV only supports a subset of X86 cond.
30545 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30546 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30547 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30548 Flags};
30549 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30550 }
30551 }
30553 // If this is a select between two integer constants, try to do some
30554 // optimizations. Note that the operands are ordered the opposite of SELECT
30556 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30557 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30558 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30559 // larger than FalseC (the false value).
30560 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30561 CC = X86::GetOppositeBranchCondition(CC);
30562 std::swap(TrueC, FalseC);
30563 std::swap(TrueOp, FalseOp);
30566 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30567 // This is efficient for any integer data type (including i8/i16) and
30569 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30570 Cond = getSETCC(CC, Cond, DL, DAG);
30572 // Zero extend the condition if needed.
30573 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30575 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30576 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30577 DAG.getConstant(ShAmt, DL, MVT::i8));
30578 if (N->getNumValues() == 2) // Dead flag value?
30579 return DCI.CombineTo(N, Cond, SDValue());
30580 return Cond;
30581 }
30583 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
30584 // for any integer data type, including i8/i16.
30585 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30586 Cond = getSETCC(CC, Cond, DL, DAG);
30588 // Zero extend the condition if needed.
30589 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30590 FalseC->getValueType(0), Cond);
30591 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30592 SDValue(FalseC, 0));
30594 if (N->getNumValues() == 2) // Dead flag value?
30595 return DCI.CombineTo(N, Cond, SDValue());
30596 return Cond;
30597 }
30599 // Optimize cases that will turn into an LEA instruction. This requires
30600 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30601 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30602 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
30603 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30605 bool isFastMultiplier = false;
30606 if (Diff < 10) {
30607 switch ((unsigned char)Diff) {
30608 default: break;
30609 case 1: // result = add base, cond
30610 case 2: // result = lea base( , cond*2)
30611 case 3: // result = lea base(cond, cond*2)
30612 case 4: // result = lea base( , cond*4)
30613 case 5: // result = lea base(cond, cond*4)
30614 case 8: // result = lea base( , cond*8)
30615 case 9: // result = lea base(cond, cond*8)
30616 isFastMultiplier = true;
30621 if (isFastMultiplier) {
30622 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
30623 Cond = getSETCC(CC, Cond, DL ,DAG);
30624 // Zero extend the condition if needed.
30625 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
30627 // Scale the condition by the difference.
30629 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30630 DAG.getConstant(Diff, DL, Cond.getValueType()));
30632 // Add the base if non-zero.
30633 if (FalseC->getAPIntValue() != 0)
30634 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30635 SDValue(FalseC, 0));
30636 if (N->getNumValues() == 2) // Dead flag value?
30637 return DCI.CombineTo(N, Cond, SDValue());
30644 // Handle these cases:
30645 // (select (x != c), e, c) -> (select (x != c), e, x),
30646 // (select (x == c), c, e) -> (select (x == c), x, e)
30647 // where the c is an integer constant, and the "select" is the combination
30648 // of CMOV and CMP.
30650 // The rationale for this change is that the conditional-move from a constant
30651 // needs two instructions, whereas a conditional-move from a register needs
30652 // only one instruction.
30654 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30655 // some instruction-combining opportunities. This opt needs to be
30656 // postponed as late as possible.
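// For example, "x != 7 ? e : 7" would otherwise need a "mov $7, %tmp" followed
// by a cmov, whereas "x != 7 ? e : x" only needs the cmov itself, since when
// the condition is false x is already known to equal 7.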
30658 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
30659 // the DCI.xxxx conditions are provided to postpone the optimization as
30660 // late as possible.
30662 ConstantSDNode *CmpAgainst = nullptr;
30663 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30664 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30665 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30667 if (CC == X86::COND_NE &&
30668 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30669 CC = X86::GetOppositeBranchCondition(CC);
30670 std::swap(TrueOp, FalseOp);
30673 if (CC == X86::COND_E &&
30674 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30675 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30676 DAG.getConstant(CC, DL, MVT::i8), Cond };
30677 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30682 // Fold and/or of setcc's to double CMOV:
30683 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30684 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30686 // This combine lets us generate:
30687 // cmovcc1 (jcc1 if we don't have CMOV)
30693 // cmovne (jne if we don't have CMOV)
30694 // When we can't use the CMOV instruction, it might increase branch mispredicts.
30696 // When we can use CMOV, or when there is no mispredict, this improves
30697 // throughput and reduces register pressure.
30699 if (CC == X86::COND_NE) {
30701 X86::CondCode CC0, CC1;
30703 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
30705 std::swap(FalseOp, TrueOp);
30706 CC0 = X86::GetOppositeBranchCondition(CC0);
30707 CC1 = X86::GetOppositeBranchCondition(CC1);
30710 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
30712 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30713 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30714 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30715 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
30723 /// Different mul shrinking modes.
30724 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
30726 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30727 EVT VT = N->getOperand(0).getValueType();
30728 if (VT.getScalarSizeInBits() != 32)
30731 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30732 unsigned SignBits[2] = {1, 1};
30733 bool IsPositive[2] = {false, false};
30734 for (unsigned i = 0; i < 2; i++) {
30735 SDValue Opd = N->getOperand(i);
30737 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30738 // compute the sign bits for it separately.
30739 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
30740 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
30742 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
30744 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
30749 IsPositive[i] = true;
30750 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30751 // All the operands of BUILD_VECTOR need to be integer constants.
30752 // Find the smallest value range which all the operands belong to.
30754 IsPositive[i] = true;
30755 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
30756 if (SubOp.isUndef())
30758 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
30761 APInt IntVal = CN->getAPIntValue();
30762 if (IntVal.isNegative())
30763 IsPositive[i] = false;
30764 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
30767 SignBits[i] = DAG.ComputeNumSignBits(Opd);
30768 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
30769 IsPositive[i] = true;
30773 bool AllPositive = IsPositive[0] && IsPositive[1];
30774 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
30775 // When ranges are from -128 ~ 127, use MULS8 mode.
30776 if (MinSignBits >= 25)
30778 // When ranges are from 0 ~ 255, use MULU8 mode.
30779 else if (AllPositive && MinSignBits >= 24)
30781 // When ranges are from -32768 ~ 32767, use MULS16 mode.
30782 else if (MinSignBits >= 17)
30784 // When ranges are from 0 ~ 65535, use MULU16 mode.
30785 else if (AllPositive && MinSignBits >= 16)
30792 /// When the operands of a vector mul are extended from smaller-sized values,
30793 /// like i8 and i16, the type of the mul may be shrunk to generate more
30794 /// efficient code. Two typical patterns are handled:
30796 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30797 /// %4 = sext/zext <N x i8> %3 to <N x i32>
30798 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30799 /// %5 = mul <N x i32> %2, %4
30802 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30803 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30804 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30805 /// %5 = mul <N x i32> %2, %4
30807 /// There are four mul shrinking modes:
30808 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
30809 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30810 /// generate pmullw+sext32 for it (MULS8 mode).
30811 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30812 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30813 /// generate pmullw+zext32 for it (MULU8 mode).
30814 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30815 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30816 /// generate pmullw+pmulhw for it (MULS16 mode).
30817 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30818 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30819 /// generate pmullw+pmulhuw for it (MULU16 mode).
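/// For example, a "mul <8 x i32>" whose operands are both zero-extended from
/// <8 x i8> falls into MULU8 mode: the multiply can be done as a single
/// <8 x i16> pmullw, and the result is then zero-extended back to <8 x i32>.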
30820 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30821 const X86Subtarget &Subtarget) {
30822 // Check for legality
30823 // pmullw/pmulhw are not supported by SSE.
30824 if (!Subtarget.hasSSE2())
30827 // Check for profitability
30828 // pmulld is supported since SSE41. It is better to use pmulld
30829 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than the expanded alternative.
30831 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30832 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30836 if (!canReduceVMulWidth(N, DAG, Mode))
30840 SDValue N0 = N->getOperand(0);
30841 SDValue N1 = N->getOperand(1);
30842 EVT VT = N->getOperand(0).getValueType();
30843 unsigned RegSize = 128;
30844 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
30846 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30847 // Shrink the operands of mul.
30848 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30849 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30851 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30852 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30853 // lower part is needed.
30854 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30855 if (Mode == MULU8 || Mode == MULS8) {
30856 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
30859 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30860 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30861 // the higher part is also needed.
30862 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30863 ReducedVT, NewN0, NewN1);
30865 // Repack the lower part and higher part result of mul into a wider result.
30867 // Generate shuffle functioning as punpcklwd.
30868 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30869 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30870 ShuffleMask[2 * i] = i;
30871 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
30874 SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30875 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30876 // Generate shuffle functioning as punpckhwd.
30877 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30878 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30879 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
30882 SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30883 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30884 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30887 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30888 // to legalize the mul explicitly because implicit legalization for type
30889 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30890 // instructions which will not exist when we explicitly legalize it by
30891 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30892 // <4 x i16> undef).
30894 // Legalize the operands of mul.
30895 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30896 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30897 if ((RegSize % ReducedSizeInBits) != 0)
30900 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30901 DAG.getUNDEF(ReducedVT));
30903 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30905 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30907 if (Mode == MULU8 || Mode == MULS8) {
30908 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
30910 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30912 // convert the type of mul result to VT.
30913 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30914 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
30915 : ISD::SIGN_EXTEND_VECTOR_INREG, DL, ResVT, Mul);
30917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30918 DAG.getIntPtrConstant(0, DL));
30920 // Generate the lower and higher parts of the mul: pmullw and pmulhw/pmulhuw. For
30921 // MULU16/MULS16, both parts are needed.
30922 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30923 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30924 OpsVT, NewN0, NewN1);
30926 // Repack the lower part and higher part result of mul into a wider
30927 // result. Make sure the type of mul result is VT.
30928 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30929 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30930 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30931 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30932 DAG.getIntPtrConstant(0, DL));
30937 /// Optimize a single multiply with constant into two operations in order to
30938 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
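/// For example, x * 45 can be decomposed as (x * 9) * 5 (two LEAs), and
/// x * 40 as (x * 5) << 3 (an LEA followed by a shift).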
30939 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
30940 TargetLowering::DAGCombinerInfo &DCI,
30941 const X86Subtarget &Subtarget) {
30942 EVT VT = N->getValueType(0);
30943 if (DCI.isBeforeLegalize() && VT.isVector())
30944 return reduceVMULWidth(N, DAG, Subtarget);
30946 // An imul is usually smaller than the alternative sequence.
30947 if (DAG.getMachineFunction().getFunction()->optForMinSize())
30950 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
30953 if (VT != MVT::i64 && VT != MVT::i32)
30956 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
30959 uint64_t MulAmt = C->getZExtValue();
30960 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
30963 uint64_t MulAmt1 = 0;
30964 uint64_t MulAmt2 = 0;
30965 if ((MulAmt % 9) == 0) {
30967 MulAmt2 = MulAmt / 9;
30968 } else if ((MulAmt % 5) == 0) {
30970 MulAmt2 = MulAmt / 5;
30971 } else if ((MulAmt % 3) == 0) {
30973 MulAmt2 = MulAmt / 3;
30979 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
30981 if (isPowerOf2_64(MulAmt2) &&
30982 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
30983 // If the second multiplier is pow2, issue it first. We want the multiply by
30984 // 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
30986 std::swap(MulAmt1, MulAmt2);
30988 if (isPowerOf2_64(MulAmt1))
30989 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30990 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30992 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30993 DAG.getConstant(MulAmt1, DL, VT));
30995 if (isPowerOf2_64(MulAmt2))
30996 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30997 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
30999 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31000 DAG.getConstant(MulAmt2, DL, VT));
31004 assert(MulAmt != 0 &&
31005 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31006 "Both cases that could cause potential overflows should have "
31007 "already been handled.");
31008 int64_t SignMulAmt = C->getSExtValue();
31009 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31010 (SignMulAmt != -INT64_MAX)) {
31011 int NumSign = SignMulAmt > 0 ? 1 : -1;
31012 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31013 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31014 if (IsPowerOf2_64PlusOne) {
31015 // (mul x, 2^N + 1) => (add (shl x, N), x)
31016 NewMul = DAG.getNode(
31017 ISD::ADD, DL, VT, N->getOperand(0),
31018 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31019 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31021 } else if (IsPowerOf2_64MinusOne) {
31022 // (mul x, 2^N - 1) => (sub (shl x, N), x)
31023 NewMul = DAG.getNode(
31025 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31026 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31030 // To negate, subtract the number from zero
31031 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31033 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31038 // Do not add new nodes to DAG combiner worklist.
31039 DCI.CombineTo(N, NewMul, false);
31044 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31045 SDValue N0 = N->getOperand(0);
31046 SDValue N1 = N->getOperand(1);
31047 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31048 EVT VT = N0.getValueType();
31050 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31051 // since the result of setcc_c is all zero's or all ones.
31052 if (VT.isInteger() && !VT.isVector() &&
31053 N1C && N0.getOpcode() == ISD::AND &&
31054 N0.getOperand(1).getOpcode() == ISD::Constant) {
31055 SDValue N00 = N0.getOperand(0);
31056 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31057 const APInt &ShAmt = N1C->getAPIntValue();
31058 Mask = Mask.shl(ShAmt);
31059 bool MaskOK = false;
31060 // We can handle cases concerning bit-widening nodes containing setcc_c if
31061 // we carefully interrogate the mask to make sure we are semantics preserving.
31063 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31064 // of the underlying setcc_c operation if the setcc_c was zero extended.
31065 // Consider the following example:
31066 // zext(setcc_c) -> i32 0x0000FFFF
31067 // c1 -> i32 0x0000FFFF
31068 // c2 -> i32 0x00000001
31069 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31070 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
31071 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31073 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31074 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31076 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31077 N00.getOpcode() == ISD::ANY_EXTEND) &&
31078 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31079 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31081 if (MaskOK && Mask != 0) {
31083 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31087 // Hardware support for vector shifts is sparse which makes us scalarize the
31088 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL.
31090 // (shl V, 1) -> add V,V
31091 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31092 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31093 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31094 // We shift all of the values by one. In many cases we do not have
31095 // hardware support for this operation. This is better expressed as an ADD of two values.
31097 if (N1SplatC->getAPIntValue() == 1)
31098 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31104 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31105 SDValue N0 = N->getOperand(0);
31106 SDValue N1 = N->getOperand(1);
31107 EVT VT = N0.getValueType();
31108 unsigned Size = VT.getSizeInBits();
31110 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31111 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31112 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31113 // depending on sign of (SarConst - [56,48,32,24,16])
31115 // sexts on X86 are MOVs. The MOVs have the same code size
31116 // as the above SHIFTs (only a shift by 1 has a smaller encoding).
31117 // However, the MOVs have two advantages over a SHIFT:
31118 // 1. MOVs can write to a register that differs from source
31119 // 2. MOVs accept memory operands
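// For example, on i64 "(ashr (shl x, 56), 56)" is simply a sign-extension of
// the low byte (a movsbq), while "(ashr (shl x, 56), 48)" becomes
// "(shl (sext_inreg x, i8), 8)".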
31121 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31122 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31123 N0.getOperand(1).getOpcode() != ISD::Constant)
31126 SDValue N00 = N0.getOperand(0);
31127 SDValue N01 = N0.getOperand(1);
31128 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31129 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31130 EVT CVT = N1.getValueType();
31132 if (SarConst.isNegative())
31135 for (MVT SVT : MVT::integer_valuetypes()) {
31136 unsigned ShiftSize = SVT.getSizeInBits();
31137 // Skip types without a corresponding sext/zext and
31138 // a ShlConst that is not one of [56,48,32,24,16].
31139 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31143 SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31144 SarConst = SarConst - (Size - ShiftSize);
31147 else if (SarConst.isNegative())
31148 return DAG.getNode(ISD::SHL, DL, VT, NN,
31149 DAG.getConstant(-SarConst, DL, CVT));
31151 return DAG.getNode(ISD::SRA, DL, VT, NN,
31152 DAG.getConstant(SarConst, DL, CVT));
31157 /// \brief Returns a vector of 0s if the input node is a vector logical
31158 /// shift by a constant amount which is known to be bigger than or equal
31159 /// to the vector element size in bits.
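/// For example, "(srl <4 x i32> %x, <32, 32, 32, 32>)" is known to produce
/// all zeroes, so it can be replaced with a zero vector outright.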
31160 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31161 const X86Subtarget &Subtarget) {
31162 EVT VT = N->getValueType(0);
31164 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31165 (!Subtarget.hasInt256() ||
31166 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31169 SDValue Amt = N->getOperand(1);
31171 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31172 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31173 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31174 unsigned MaxAmount =
31175 VT.getSimpleVT().getScalarSizeInBits();
31177 // SSE2/AVX2 logical shifts always return a vector of 0s
31178 // if the shift amount is bigger than or equal to
31179 // the element size. The constant shift amount will be
31180 // encoded as an 8-bit immediate.
31181 if (ShiftAmt.trunc(8).uge(MaxAmount))
31182 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31188 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31189 TargetLowering::DAGCombinerInfo &DCI,
31190 const X86Subtarget &Subtarget) {
31191 if (N->getOpcode() == ISD::SHL)
31192 if (SDValue V = combineShiftLeft(N, DAG))
31195 if (N->getOpcode() == ISD::SRA)
31196 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31199 // Try to fold this logical shift into a zero vector.
31200 if (N->getOpcode() != ISD::SRA)
31201 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31207 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31208 TargetLowering::DAGCombinerInfo &DCI,
31209 const X86Subtarget &Subtarget) {
31210 unsigned Opcode = N->getOpcode();
31211 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31212 X86ISD::VSRLI == Opcode) &&
31213 "Unexpected shift opcode");
31214 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31215 EVT VT = N->getValueType(0);
31216 SDValue N0 = N->getOperand(0);
31217 SDValue N1 = N->getOperand(1);
31218 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31219 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31220 "Unexpected value type");
31222 // Out of range logical bit shifts are guaranteed to be zero.
31223 // Out of range arithmetic bit shifts splat the sign bit.
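// For example, a VSRLI by 16 on v8i16 always yields zero, while a VSRAI by 16
// on v8i16 behaves like a VSRAI by 15 (it just smears the sign bit).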
31224 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31225 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31227 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31229 ShiftVal = NumBitsPerElt - 1;
31232 // Shift N0 by zero -> N0.
31236 // Shift zero -> zero.
31237 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31238 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31240 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31241 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31242 // TODO - support other sra opcodes as needed.
31243 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31244 N0.getOpcode() == X86ISD::VSRAI)
31245 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31247 // We can decode 'whole byte' logical bit shifts as shuffles.
31248 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31250 SmallVector<int, 1> NonceMask; // Just a placeholder.
31251 NonceMask.push_back(0);
31252 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31253 /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget))
31255 return SDValue(); // This routine will use CombineTo to replace N.
31258 // Constant Folding.
31260 SmallVector<APInt, 32> EltBits;
31261 if (N->isOnlyUserOf(N0.getNode()) &&
31262 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31263 assert(EltBits.size() == VT.getVectorNumElements() &&
31264 "Unexpected shift value type");
31265 unsigned ShiftImm = ShiftVal.getZExtValue();
31266 for (APInt &Elt : EltBits) {
31267 if (X86ISD::VSHLI == Opcode)
31268 Elt = Elt.shl(ShiftImm);
31269 else if (X86ISD::VSRAI == Opcode)
31270 Elt = Elt.ashr(ShiftImm);
31272 Elt.lshrInPlace(ShiftImm);
31274 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31280 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31281 TargetLowering::DAGCombinerInfo &DCI,
31282 const X86Subtarget &Subtarget) {
31284 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31285 (N->getOpcode() == X86ISD::PINSRW &&
31286 N->getValueType(0) == MVT::v8i16)) &&
31287 "Unexpected vector insertion");
31289 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31291 SmallVector<int, 1> NonceMask; // Just a placeholder.
31292 NonceMask.push_back(0);
31293 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31294 /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget);
31299 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31300 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31301 /// OR -> CMPNEQSS.
31302 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31303 TargetLowering::DAGCombinerInfo &DCI,
31304 const X86Subtarget &Subtarget) {
31307 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31308 // we're requiring SSE2 for both.
31309 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31310 SDValue N0 = N->getOperand(0);
31311 SDValue N1 = N->getOperand(1);
31312 SDValue CMP0 = N0->getOperand(1);
31313 SDValue CMP1 = N1->getOperand(1);
31316 // The SETCCs should both refer to the same CMP.
31317 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31320 SDValue CMP00 = CMP0->getOperand(0);
31321 SDValue CMP01 = CMP0->getOperand(1);
31322 EVT VT = CMP00.getValueType();
31324 if (VT == MVT::f32 || VT == MVT::f64) {
31325 bool ExpectingFlags = false;
31326 // Check for any users that want flags:
31327 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31328 !ExpectingFlags && UI != UE; ++UI)
31329 switch (UI->getOpcode()) {
31334 ExpectingFlags = true;
31336 case ISD::CopyToReg:
31337 case ISD::SIGN_EXTEND:
31338 case ISD::ZERO_EXTEND:
31339 case ISD::ANY_EXTEND:
31343 if (!ExpectingFlags) {
31344 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31345 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31347 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31348 X86::CondCode tmp = cc0;
31353 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31354 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31355 // FIXME: need symbolic constants for these magic numbers.
31356 // See X86ATTInstPrinter.cpp:printSSECC().
31357 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31358 if (Subtarget.hasAVX512()) {
31359 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
31361 DAG.getConstant(x86cc, DL, MVT::i8));
31362 if (N->getValueType(0) != MVT::i1)
31363 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
31367 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31368 CMP00.getValueType(), CMP00, CMP01,
31369 DAG.getConstant(x86cc, DL,
31372 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31373 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31375 if (is64BitFP && !Subtarget.is64Bit()) {
31376 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31377 // 64-bit integer, since that's not a legal type. Since
31378 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31379 // bits, but can do this little dance to extract the lowest 32 bits
31380 // and work with those going forward.
31381 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31383 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31384 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31385 Vector32, DAG.getIntPtrConstant(0, DL));
31389 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31390 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31391 DAG.getConstant(1, DL, IntVT));
31392 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31394 return OneBitOfTruth;
31402 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31403 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31404 assert(N->getOpcode() == ISD::AND);
31406 EVT VT = N->getValueType(0);
31407 SDValue N0 = N->getOperand(0);
31408 SDValue N1 = N->getOperand(1);
31411 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31414 if (N0.getOpcode() == ISD::XOR &&
31415 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31416 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31418 if (N1.getOpcode() == ISD::XOR &&
31419 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31420 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31425 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31426 // register. In most cases we actually compare or select YMM-sized registers
31427 // and mixing the two types creates horrible code. This method optimizes
31428 // some of the transition sequences.
31429 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31430 TargetLowering::DAGCombinerInfo &DCI,
31431 const X86Subtarget &Subtarget) {
31432 EVT VT = N->getValueType(0);
31433 if (!VT.is256BitVector())
31436 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31437 N->getOpcode() == ISD::ZERO_EXTEND ||
31438 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31440 SDValue Narrow = N->getOperand(0);
31441 EVT NarrowVT = Narrow->getValueType(0);
31442 if (!NarrowVT.is128BitVector())
31445 if (Narrow->getOpcode() != ISD::XOR &&
31446 Narrow->getOpcode() != ISD::AND &&
31447 Narrow->getOpcode() != ISD::OR)
31450 SDValue N0 = Narrow->getOperand(0);
31451 SDValue N1 = Narrow->getOperand(1);
31454 // The Left side has to be a trunc.
31455 if (N0.getOpcode() != ISD::TRUNCATE)
31458 // The type of the truncated inputs.
31459 EVT WideVT = N0->getOperand(0)->getValueType(0);
31463 // The right side has to be a 'trunc' or a constant vector.
31464 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31465 ConstantSDNode *RHSConstSplat = nullptr;
31466 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31467 RHSConstSplat = RHSBV->getConstantSplatNode();
31468 if (!RHSTrunc && !RHSConstSplat)
31471 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31473 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31476 // Set N0 and N1 to hold the inputs to the new wide operation.
31477 N0 = N0->getOperand(0);
31478 if (RHSConstSplat) {
31479 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31480 SDValue(RHSConstSplat, 0));
31481 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31482 } else if (RHSTrunc) {
31483 N1 = N1->getOperand(0);
31486 // Generate the wide operation.
31487 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31488 unsigned Opcode = N->getOpcode();
31490 case ISD::ANY_EXTEND:
31492 case ISD::ZERO_EXTEND: {
31493 unsigned InBits = NarrowVT.getScalarSizeInBits();
31494 APInt Mask = APInt::getAllOnesValue(InBits);
31495 Mask = Mask.zext(VT.getScalarSizeInBits());
31496 return DAG.getNode(ISD::AND, DL, VT,
31497 Op, DAG.getConstant(Mask, DL, VT));
31499 case ISD::SIGN_EXTEND:
31500 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31501 Op, DAG.getValueType(NarrowVT));
31503 llvm_unreachable("Unexpected opcode");
31507 /// If both input operands of a logic op are being cast from floating point
31508 /// types, try to convert this into a floating point logic node to avoid
31509 /// unnecessary moves from SSE to integer registers.
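/// For example, "(and (bitcast float X to i32), (bitcast float Y to i32))" can
/// be emitted as "(bitcast (FAND X, Y) to i32)", keeping both values in SSE
/// registers instead of bouncing them through GPRs.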
31510 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31511 const X86Subtarget &Subtarget) {
31512 unsigned FPOpcode = ISD::DELETED_NODE;
31513 if (N->getOpcode() == ISD::AND)
31514 FPOpcode = X86ISD::FAND;
31515 else if (N->getOpcode() == ISD::OR)
31516 FPOpcode = X86ISD::FOR;
31517 else if (N->getOpcode() == ISD::XOR)
31518 FPOpcode = X86ISD::FXOR;
31520 assert(FPOpcode != ISD::DELETED_NODE &&
31521 "Unexpected input node for FP logic conversion");
31523 EVT VT = N->getValueType(0);
31524 SDValue N0 = N->getOperand(0);
31525 SDValue N1 = N->getOperand(1);
31527 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31528 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31529 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31530 SDValue N00 = N0.getOperand(0);
31531 SDValue N10 = N1.getOperand(0);
31532 EVT N00Type = N00.getValueType();
31533 EVT N10Type = N10.getValueType();
31534 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31535 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31536 return DAG.getBitcast(VT, FPLogic);
31542 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31543 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31544 /// with a shift-right to eliminate loading the vector constant mask value.
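/// For example, if every element of X is known to be all-ones or all-zeros
/// (i.e. a sign-bit splat), then "(and X, splat(1))" on v4i32 can be rewritten
/// as "(X86ISD::VSRLI X, 31)", avoiding a load of the constant mask.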
31545 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31546 const X86Subtarget &Subtarget) {
31547 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31548 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31549 EVT VT0 = Op0.getValueType();
31550 EVT VT1 = Op1.getValueType();
31552 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
31556 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
31557 !SplatVal.isMask())
31560 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
31563 unsigned EltBitWidth = VT0.getScalarSizeInBits();
31564 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
31568 unsigned ShiftVal = SplatVal.countTrailingOnes();
31569 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31570 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31571 return DAG.getBitcast(N->getValueType(0), Shift);
31574 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31575 TargetLowering::DAGCombinerInfo &DCI,
31576 const X86Subtarget &Subtarget) {
31577 if (DCI.isBeforeLegalizeOps())
31580 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31583 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31586 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
31589 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
31592 EVT VT = N->getValueType(0);
31593 SDValue N0 = N->getOperand(0);
31594 SDValue N1 = N->getOperand(1);
31597 // Attempt to recursively combine a bitmask AND with shuffles.
31598 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
31600 SmallVector<int, 1> NonceMask; // Just a placeholder.
31601 NonceMask.push_back(0);
31602 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31603 /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget))
31605 return SDValue(); // This routine will use CombineTo to replace N.
31608 // Create BEXTR instructions
31609 // BEXTR is ((X >> imm) & (2**size-1))
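// For example, "(and (srl x, 4), 255)" extracts an 8-bit field starting at
// bit 4, which maps to a BEXTR with control value 0x0804 (length 8 in bits
// 15:8, start 4 in bits 7:0).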
31610 if (VT != MVT::i32 && VT != MVT::i64)
31613 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
31615 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
31618 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31619 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31620 if (MaskNode && ShiftNode) {
31621 uint64_t Mask = MaskNode->getZExtValue();
31622 uint64_t Shift = ShiftNode->getZExtValue();
31623 if (isMask_64(Mask)) {
31624 uint64_t MaskSize = countPopulation(Mask);
31625 if (Shift + MaskSize <= VT.getSizeInBits())
31626 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
31627 DAG.getConstant(Shift | (MaskSize << 8), DL,
31635 // Try to fold (or (and (m, y), (pandn m, x)))
31637 // into (vselect m, x, y).
31638 // As a special case, try to fold:
31639 // (or (and (m, (sub 0, x)), (pandn m, x)))
31641 // into (sub (xor X, M), M)
31642 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31643 const X86Subtarget &Subtarget) {
31644 assert(N->getOpcode() == ISD::OR);
31646 SDValue N0 = N->getOperand(0);
31647 SDValue N1 = N->getOperand(1);
31648 EVT VT = N->getValueType(0);
31650 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
31652 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
31654 // Canonicalize pandn to RHS
31655 if (N0.getOpcode() == X86ISD::ANDNP)
31658 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
31661 SDValue Mask = N1.getOperand(0);
31662 SDValue X = N1.getOperand(1);
31664 if (N0.getOperand(0) == Mask)
31665 Y = N0.getOperand(1);
31666 if (N0.getOperand(1) == Mask)
31667 Y = N0.getOperand(0);
31669 // Check to see if the mask appeared in both the AND and ANDNP.
31673 // Validate that X, Y, and Mask are bitcasts, and see through them.
31674 Mask = peekThroughBitcasts(Mask);
31675 X = peekThroughBitcasts(X);
31676 Y = peekThroughBitcasts(Y);
31678 EVT MaskVT = Mask.getValueType();
31680 // Validate that the Mask operand is a vector sra node.
31681 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
31682 // there is no psrai.b
31683 unsigned EltBits = MaskVT.getScalarSizeInBits();
31684 unsigned SraAmt = ~0;
31685 if (Mask.getOpcode() == ISD::SRA) {
31686 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
31687 if (auto *AmtConst = AmtBV->getConstantSplatNode())
31688 SraAmt = AmtConst->getZExtValue();
31689 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
31690 SDValue SraC = Mask.getOperand(1);
31691 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
31693 if ((SraAmt + 1) != EltBits)
31699 // (or (and (M, (sub 0, X)), (pandn M, X)))
31700 // which is a special case of vselect:
31701 // (vselect M, (sub 0, X), X)
31703 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31704 // We know that, if fNegate is 0 or 1:
31705 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31707 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31708 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31709 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31710 // This lets us transform our vselect to:
31711 // (add (xor X, M), (and M, 1))
31713 // (sub (xor X, M), M)
31714 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
31715 auto IsNegV = [](SDNode *N, SDValue V) {
31716 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
31717 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
31720 if (IsNegV(Y.getNode(), X))
31722 else if (IsNegV(X.getNode(), Y))
31726 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
31727 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31728 SDValue SubOp2 = Mask;
31730 // If the negate was on the false side of the select, then
31731 // the operands of the SUB need to be swapped. PR 27251.
31732 // This is because the pattern being matched above is
31733 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
31734 // but if the pattern matched was
31735 // (vselect M, X, (sub 0, X)), that is really the negation of the pattern
31736 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31737 // pattern also needs to be a negation of the replacement pattern above.
31738 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31739 // sub accomplishes the negation of the replacement pattern.
31741 std::swap(SubOp1, SubOp2);
31743 return DAG.getBitcast(VT,
31744 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
31748 // PBLENDVB is only available on SSE 4.1.
31749 if (!Subtarget.hasSSE41())
31752 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31754 X = DAG.getBitcast(BlendVT, X);
31755 Y = DAG.getBitcast(BlendVT, Y);
31756 Mask = DAG.getBitcast(BlendVT, Mask);
31757 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
31758 return DAG.getBitcast(VT, Mask);
31761 // Helper function for combineOrCmpEqZeroToCtlzSrl. Transforms seteq(cmp x, 0) into
31765 // srl(ctlz x), log2(bitsize(x)).
31766 // Input pattern is checked by caller.
31767 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31768 SelectionDAG &DAG) {
31769 SDValue Cmp = Op.getOperand(1);
31770 EVT VT = Cmp.getOperand(0).getValueType();
31771 unsigned Log2b = Log2_32(VT.getSizeInBits());
31773 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31774 // The result of the shift is true or false, and on X86, the 32-bit
31775 // encoding of shr and lzcnt is more desirable.
31776 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31777 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31778 DAG.getConstant(Log2b, dl, VT));
31779 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31782 // Try to transform:
31783 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
31785 // into srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
31786 // Will also attempt to match more generic cases, eg:
31787 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31788 // Only applies if the target supports the FastLZCNT feature.
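// For example, for a 32-bit x, "x == 0" is equivalent to "(ctlz x) >> 5":
// ctlz(x) is 32 only when x is zero, and 32 is the only possible result with
// bit 5 set, so the shifted value is 1 exactly when x == 0.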
31789 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31790 TargetLowering::DAGCombinerInfo &DCI,
31791 const X86Subtarget &Subtarget) {
31792 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
31795 auto isORCandidate = [](SDValue N) {
31796 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31799 // Check that the zero extend is extending to 32 bits or more. The code generated by
31800 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31801 // instructions to clear the upper bits.
31802 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
31803 !isORCandidate(N->getOperand(0)))
31806 // Check the node matches: setcc(eq, cmp 0)
31807 auto isSetCCCandidate = [](SDValue N) {
31808 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31809 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31810 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31811 isNullConstant(N->getOperand(1).getOperand(1)) &&
31812 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31815 SDNode *OR = N->getOperand(0).getNode();
31816 SDValue LHS = OR->getOperand(0);
31817 SDValue RHS = OR->getOperand(1);
31819 // Save nodes matching or(or, setcc(eq, cmp 0)).
31820 SmallVector<SDNode *, 2> ORNodes;
31821 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31822 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31823 ORNodes.push_back(OR);
31824 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31825 LHS = OR->getOperand(0);
31826 RHS = OR->getOperand(1);
31829 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31830 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
31831 !isORCandidate(SDValue(OR, 0)))
31834 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31836 // or(srl(ctlz),srl(ctlz)).
31837 // The dag combiner can then fold it into:
31838 // srl(or(ctlz, ctlz)).
31839 EVT VT = OR->getValueType(0);
31840 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31841 SDValue Ret, NewRHS;
31842 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
31843 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
31848 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31849 while (ORNodes.size() > 0) {
31850 OR = ORNodes.pop_back_val();
31851 LHS = OR->getOperand(0);
31852 RHS = OR->getOperand(1);
31853 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31854 if (RHS->getOpcode() == ISD::OR)
31855 std::swap(LHS, RHS);
31856 EVT VT = OR->getValueType(0);
31857 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
31860 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
31864 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
31869 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31870 TargetLowering::DAGCombinerInfo &DCI,
31871 const X86Subtarget &Subtarget) {
31872 if (DCI.isBeforeLegalizeOps())
31875 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31878 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31881 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31884 SDValue N0 = N->getOperand(0);
31885 SDValue N1 = N->getOperand(1);
31886 EVT VT = N->getValueType(0);
31888 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31891 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31892 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31894 // SHLD/SHRD instructions have lower register pressure, but on some
31895 // platforms they have higher latency than the equivalent
31896 // series of shifts/or that would otherwise be generated.
31897 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31898 // have higher latencies and we are not optimizing for size.
31899 if (!OptForSize && Subtarget.isSHLDSlow())
31902 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
31904 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31906 if (!N0.hasOneUse() || !N1.hasOneUse())
31909 SDValue ShAmt0 = N0.getOperand(1);
31910 if (ShAmt0.getValueType() != MVT::i8)
31912 SDValue ShAmt1 = N1.getOperand(1);
31913 if (ShAmt1.getValueType() != MVT::i8)
31915 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31916 ShAmt0 = ShAmt0.getOperand(0);
31917 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31918 ShAmt1 = ShAmt1.getOperand(0);
31921 unsigned Opc = X86ISD::SHLD;
31922 SDValue Op0 = N0.getOperand(0);
31923 SDValue Op1 = N1.getOperand(0);
31924 if (ShAmt0.getOpcode() == ISD::SUB ||
31925 ShAmt0.getOpcode() == ISD::XOR) {
31926 Opc = X86ISD::SHRD;
31927 std::swap(Op0, Op1);
31928 std::swap(ShAmt0, ShAmt1);
31931 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31932 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31933 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31934 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31935 unsigned Bits = VT.getSizeInBits();
31936 if (ShAmt1.getOpcode() == ISD::SUB) {
31937 SDValue Sum = ShAmt1.getOperand(0);
31938 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31939 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31940 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31941 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
31942 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
31943 return DAG.getNode(Opc, DL, VT,
31945 DAG.getNode(ISD::TRUNCATE, DL,
31948 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
31949 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
31950 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
31951 return DAG.getNode(Opc, DL, VT,
31952 N0.getOperand(0), N1.getOperand(0),
31953 DAG.getNode(ISD::TRUNCATE, DL,
31955 } else if (ShAmt1.getOpcode() == ISD::XOR) {
31956 SDValue Mask = ShAmt1.getOperand(1);
31957 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
31958 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
31959 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
31960 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
31961 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
31962 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
31963 if (Op1.getOpcode() == InnerShift &&
31964 isa<ConstantSDNode>(Op1.getOperand(1)) &&
31965 Op1.getConstantOperandVal(1) == 1) {
31966 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31967 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31969 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
31970 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
31971 Op1.getOperand(0) == Op1.getOperand(1)) {
31972 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31973 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31982 /// Generate NEG and CMOV for integer abs.
31983 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
31984 EVT VT = N->getValueType(0);
31986 // Since X86 does not have CMOV for 8-bit integer, we don't convert
31987 // 8-bit integer abs to NEG and CMOV.
31988 if (VT.isInteger() && VT.getSizeInBits() == 8)
31991 SDValue N0 = N->getOperand(0);
31992 SDValue N1 = N->getOperand(1);
31995 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
31996 // and change it to SUB and CMOV.
31997 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
31998 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
31999 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32000 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32001 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32002 // Generate SUB & CMOV.
32003 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32004 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32005 SDValue Ops[] = {N0.getOperand(0), Neg,
32006 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32007 SDValue(Neg.getNode(), 1)};
32008 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32014 /// Try to turn tests against the signbit in the form of:
32015 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) into SETGT(X, -1).
32018 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32019 // This is only worth doing if the output type is i8 or i1.
32020 EVT ResultType = N->getValueType(0);
32021 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32024 SDValue N0 = N->getOperand(0);
32025 SDValue N1 = N->getOperand(1);
32027 // We should be performing an xor against a truncated shift.
32028 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32031 // Make sure we are performing an xor against one.
32032 if (!isOneConstant(N1))
32035 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32036 SDValue Shift = N0.getOperand(0);
32037 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32040 // Make sure we are truncating from one of i16, i32 or i64.
32041 EVT ShiftTy = Shift.getValueType();
32042 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32045 // Make sure the shift amount extracts the sign bit.
32046 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32047 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32050 // Create a greater-than comparison against -1.
32051 // N.B. Using SETGE against 0 works but we want a canonical-looking
32052 // comparison; using SETGT matches up with what TranslateX86CC produces.
32054 SDValue ShiftOp = Shift.getOperand(0);
32055 EVT ShiftOpTy = ShiftOp.getValueType();
32056 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32057 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32058 *DAG.getContext(), ResultType);
32059 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32060 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32061 if (SetCCResultType != ResultType)
32062 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32066 /// Turn vector tests of the signbit in the form of:
32067 /// xor (sra X, elt_size(X)-1), -1 into pcmpgt X, -1.
32071 /// This should be called before type legalization because the pattern may not
32072 /// persist after that.
32073 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32074 const X86Subtarget &Subtarget) {
32075 EVT VT = N->getValueType(0);
32076 if (!VT.isSimple())
32079 switch (VT.getSimpleVT().SimpleTy) {
32080 default: return SDValue();
32083 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32084 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32088 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32091 // There must be a shift right algebraic before the xor, and the xor must be a
32092 // 'not' operation.
32093 SDValue Shift = N->getOperand(0);
32094 SDValue Ones = N->getOperand(1);
32095 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32096 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32099 // The shift should be smearing the sign bit across each vector element.
32100 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32104 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32105 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32106 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32109 // Create a greater-than comparison against -1. We don't use the more obvious
32110 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32111 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32114 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
32115 /// is valid for the given \p Subtarget.
32116 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32117 const X86Subtarget &Subtarget) {
32118 if (!Subtarget.hasAVX512())
32121 // FIXME: Scalar type may be supported if we move it to vector register.
32122 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32125 EVT SrcElVT = SrcVT.getScalarType();
32126 EVT DstElVT = DstVT.getScalarType();
32127 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32129 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32131 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32132 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32136 /// Detect a pattern of truncation with saturation:
32137 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32138 /// Return the source value to be truncated or SDValue() if the pattern was not detected.
32140 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32141 if (In.getOpcode() != ISD::UMIN)
32144 // Saturation with truncation. We truncate from InVT to VT.
32145 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32146 "Unexpected types for truncate operation");
32149 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32150 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32151 // to the element size of the destination type.
32152 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32158 /// Detect a pattern of truncation with saturation:
32159 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32160 /// The types should allow to use VPMOVUS* instruction on AVX512.
32161 /// Return the source value to be truncated or SDValue() if the pattern was not detected.
32163 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32164 const X86Subtarget &Subtarget) {
32165 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32167 return detectUSatPattern(In, VT);
32171 static SDValue combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32172 const X86Subtarget &Subtarget) {
32173 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32174 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32176 if (auto USatVal = detectUSatPattern(In, VT))
32177 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32178 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32182 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32183 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32184 /// X86ISD::AVG instruction.
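/// For example, "trunc (lshr (add (add (zext a), (zext b)), 1), 1)" over
/// unsigned i8 elements is a rounding average and maps to a single pavgb
/// (pavgw for i16 elements).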
32185 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32186 const X86Subtarget &Subtarget,
32188 if (!VT.isVector() || !VT.isSimple())
32190 EVT InVT = In.getValueType();
32191 unsigned NumElems = VT.getVectorNumElements();
32193 EVT ScalarVT = VT.getVectorElementType();
32194 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32195 isPowerOf2_32(NumElems)))
32199 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
32199 // than the original input type (i8/i16).
32200 EVT InScalarVT = InVT.getVectorElementType();
32201 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32204 if (!Subtarget.hasSSE2())
32206 if (Subtarget.hasBWI()) {
32207 if (VT.getSizeInBits() > 512)
32209 } else if (Subtarget.hasAVX2()) {
32210 if (VT.getSizeInBits() > 256)
32213 if (VT.getSizeInBits() > 128)
32217 // Detect the following pattern:
32219 // %1 = zext <N x i8> %a to <N x i32>
32220 // %2 = zext <N x i8> %b to <N x i32>
32221 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32222 // %4 = add nuw nsw <N x i32> %3, %2
32223 // %5 = lshr <N x i32> %4, <i32 1 x N>
32224 // %6 = trunc <N x i32> %5 to <N x i8>
32226 // In AVX512, the last instruction can also be a trunc store.
32228 if (In.getOpcode() != ISD::SRL)
32231 // A lambda checking the given SDValue is a constant vector and each element
32232 // is in the range [Min, Max].
32233 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32234 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32235 if (!BV || !BV->isConstant())
32237 for (SDValue Op : V->ops()) {
32238 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32241 uint64_t Val = C->getZExtValue();
32242 if (Val < Min || Val > Max)
32248 // Check if each element of the vector is left-shifted by one.
32249 auto LHS = In.getOperand(0);
32250 auto RHS = In.getOperand(1);
32251 if (!IsConstVectorInRange(RHS, 1, 1))
32253 if (LHS.getOpcode() != ISD::ADD)
32256 // Detect a pattern of a + b + 1 where the order doesn't matter.
32257 SDValue Operands[3];
32258 Operands[0] = LHS.getOperand(0);
32259 Operands[1] = LHS.getOperand(1);
32261 // Take care of the case when one of the operands is a constant vector whose
32262 // element is in the range [1, 256].
32263 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32264 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32265 Operands[0].getOperand(0).getValueType() == VT) {
32266 // The pattern is detected. Subtract one from the constant vector, then
32267 // demote it and emit X86ISD::AVG instruction.
32268 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32269 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32270 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1]);
  }

32275 if (Operands[0].getOpcode() == ISD::ADD)
32276 std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction.
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1].getOperand(0));
  }

  return SDValue();
}
32303 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32304 TargetLowering::DAGCombinerInfo &DCI,
32305 const X86Subtarget &Subtarget) {
32306 LoadSDNode *Ld = cast<LoadSDNode>(N);
32307 EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
32310 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32312 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32313 // into two 16-byte operations.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
32316 unsigned AddressSpace = Ld->getAddressSpace();
32317 unsigned Alignment = Ld->getAlignment();
32318 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32319 Ext == ISD::NON_EXTLOAD &&
32320 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32321 AddressSpace, Alignment, &Fast) && !Fast) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1), Load2.getValue(1));

    SDValue NewVec = DAG.getUNDEF(RegVT);
    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}
32351 /// If V is a build vector of boolean constants and exactly one of those
32352 /// constants is true, return the operand index of that true element.
32353 /// Otherwise, return -1.
32354 static int getOneTrueElt(SDValue V) {
32355 // This needs to be a build vector of booleans.
32356 // TODO: Checking for the i1 type matches the IR definition for the mask,
32357 // but the mask check could be loosened to i8 or other types. That might
32358 // also require checking more than 'allOnesValue'; eg, the x86 HW
32359 // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
32362 auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}
32385 /// Given a masked memory load/store operation, return true if it has one mask
32386 /// bit set. If it has one mask bit set, then also return the memory address of
32387 /// the scalar element to load/store, the vector index to insert/extract that
32388 /// scalar element, and the alignment for the scalar memory access.
32389 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32390 SelectionDAG &DAG, SDValue &Addr,
32391 SDValue &Index, unsigned &Alignment) {
32392 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}
32410 /// If exactly one element of the mask is set for a non-extending masked load,
32411 /// it is a scalar load and vector insert.
32412 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32413 /// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32416 TargetLowering::DAGCombinerInfo &DCI) {
32417 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32418 // However, some target hooks may need to be added to know when the transform
32419 // is profitable. Endianness would also have to be considered.
32421 SDValue Addr, VecIndex;
32422 unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}
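
// Illustrative example (assumed, not from the source): a masked load of
// <4 x float> whose mask is the constant <0, 0, 1, 0> becomes a scalar load
// of element 2 from BasePtr + 8 followed by an INSERT_VECTOR_ELT into the
// pass-through value at index 2.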
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
32450 // If we are loading the first and last elements of a vector, it is safe and
32451 // always faster to load the whole vector. Replace the masked load with a
32452 // vector load and select.
32453 unsigned NumElts = VT.getVectorNumElements();
32454 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32455 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32456 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32457 if (LoadFirstElt && LoadLastElt) {
32458 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32459 ML->getMemOperand());
32460 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

32464 // Convert a masked load with a constant mask into a masked load and a select.
32465 // This allows the select operation to use a faster kind of select instruction
32466 // (for example, vblendvps -> vblendps).
32468 // Don't try this if the pass-through operand is already undefined. That would
32469 // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

32473 // The new masked load has an undef pass-through operand. The select uses the
32474 // original pass-through operand.
32475 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32476 ML->getMask(), DAG.getUNDEF(VT),
32477 ML->getMemoryVT(), ML->getMemOperand(),
32478 ML->getExtensionType());
32479 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}
32484 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32485 TargetLowering::DAGCombinerInfo &DCI,
32486 const X86Subtarget &Subtarget) {
32487 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32489 // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();
32505 // Resolve extending loads.
32506 EVT VT = Mld->getValueType(0);
32507 unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

32511 assert(LdVT != VT && "Cannot extend to the same type");
32512 unsigned ToSz = VT.getScalarSizeInBits();
32513 unsigned FromSz = LdVT.getScalarSizeInBits();
32514 // From/To sizes and ElemCount must be pow of two.
32515 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32516 "Unexpected size for extending masked load");
32518 unsigned SizeRatio = ToSz / FromSz;
32519 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32521 // Create a type on which we perform the shuffle.
32522 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32523 LdVT.getScalarType(), NumElems*SizeRatio);
32524 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32526 // Convert Src0 value.
32527 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32528 if (!Mld->getSrc0().isUndef()) {
32529 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32530 for (unsigned i = 0; i != NumElems; ++i)
32531 ShuffleVec[i] = i * SizeRatio;
32533 // Can't shuffle using an illegal type.
32534 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32535 "WideVecVT should be legal");
32536 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
32542 if (Mask.getValueType() == VT) {
32543 // Mask and original value have the same type.
32544 NewMask = DAG.getBitcast(WideVecVT, Mask);
32545 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32546 for (unsigned i = 0; i != NumElems; ++i)
32547 ShuffleVec[i] = i * SizeRatio;
32548 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
32549 ShuffleVec[i] = NumElems * SizeRatio;
32550 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

32570 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
32571 Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
32578 /// If exactly one element of the mask is set for a non-truncating masked store,
32579 /// it is a vector extract and scalar store.
32580 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32581 /// mask have already been optimized in IR, so we don't bother with those here.
32582 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
32583 SelectionDAG &DAG) {
32584 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32585 // However, some target hooks may need to be added to know when the transform
32586 // is profitable. Endianness would also have to be considered.
32588 SDValue Addr, VecIndex;
32589 unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}
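
// This is the store-side mirror of reduceMaskedLoadToScalarLoad: with a single
// mask bit set, the element at that index is extracted and written back with a
// plain scalar store at BasePtr + Index * EltSize.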
32605 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
32606 const X86Subtarget &Subtarget) {
32607 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
  if (Mst->isCompressingStore())
    return SDValue();

32612 if (!Mst->isTruncatingStore())
32613 return reduceMaskedStoreToScalarStore(Mst, DAG);
32615 // Resolve truncating stores.
32616 EVT VT = Mst->getValue().getValueType();
32617 unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

32621 assert(StVT != VT && "Cannot truncate to the same type");
32622 unsigned FromSz = VT.getScalarSizeInBits();
32623 unsigned ToSz = StVT.getScalarSizeInBits();
32625 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32627 // The truncating store is legal in some cases. For example
32628 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
32629 // are designated for truncate store.
32630 // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

32634 // From/To sizes and ElemCount must be pow of two.
32635 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32636 "Unexpected size for truncating masked store");
32637 // We are going to use the original vector elt for storing.
32638 // Accumulated smaller vector elements must be a multiple of the store size.
32639 assert (((NumElems * FromSz) % ToSz) == 0 &&
32640 "Unexpected ratio for truncating masked store");
32642 unsigned SizeRatio = FromSz / ToSz;
32643 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32645 // Create a type on which we perform the shuffle.
32646 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32647 StVT.getScalarType(), NumElems*SizeRatio);
32649 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32651 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
32652 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32653 for (unsigned i = 0; i != NumElems; ++i)
32654 ShuffleVec[i] = i * SizeRatio;
32656 // Can't shuffle using an illegal type.
32657 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32658 "WideVecVT should be legal");
32660 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
32666 if (Mask.getValueType() == VT) {
32667 // Mask and original value have the same type.
32668 NewMask = DAG.getBitcast(WideVecVT, Mask);
32669 for (unsigned i = 0; i != NumElems; ++i)
32670 ShuffleVec[i] = i * SizeRatio;
32671 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
32672 ShuffleVec[i] = NumElems*SizeRatio;
32673 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

32693 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
32694 Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}
32698 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
32699 const X86Subtarget &Subtarget) {
32700 StoreSDNode *St = cast<StoreSDNode>(N);
32701 EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
32704 SDValue StoredVal = St->getOperand(1);
32705 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32707 // If we are saving a concatenation of two XMM registers and 32-byte stores
32708 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
32711 unsigned Alignment = St->getAlignment();
32712 if (VT.is256BitVector() && StVT == VT &&
32713 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

32720 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
32721 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
32723 SDValue Ptr0 = St->getBasePtr();
32724 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
                     std::min(16U, Alignment), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

32735 // Optimize trunc store (of multiple scalars) to shuffle and store.
32736 // First, pack all of the elements in one place. Next, store to memory
32737 // in fewer chunks.
32738 if (St->isTruncatingStore() && VT.isVector()) {
32739 // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
32750 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
32751 dl, Val, St->getBasePtr(),
32752 St->getMemoryVT(), St->getMemOperand(), DAG);
32754 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32755 unsigned NumElems = VT.getVectorNumElements();
32756 assert(StVT != VT && "Cannot truncate to the same type");
32757 unsigned FromSz = VT.getScalarSizeInBits();
32758 unsigned ToSz = StVT.getScalarSizeInBits();
32760 // The truncating store is legal in some cases. For example
32761 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
32762 // are designated for truncate store.
32763 // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

32767 // From, To sizes and ElemCount must be pow of two
32768 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
32769 // We are going to use the original vector elt for storing.
32770 // Accumulated smaller vector elements must be a multiple of the store size.
32771 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
32773 unsigned SizeRatio = FromSz / ToSz;
32775 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32777 // Create a type on which we perform the shuffle
32778 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32779 StVT.getScalarType(), NumElems*SizeRatio);
32781 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32783 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
32784 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
32785 for (unsigned i = 0; i != NumElems; ++i)
32786 ShuffleVec[i] = i * SizeRatio;
32788 // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

32792 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                             DAG.getUNDEF(WideVecVT),
                                             ShuffleVec);
32795 // At this point all of the data is stored at the bottom of the
32796 // register. We now need to save it to mem.
32798 // Find the largest store unit
32799 MVT StoreType = MVT::i8;
32800 for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

32805 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
32806 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
32807 (64 <= NumElems * ToSz))
32808 StoreType = MVT::f64;
32810 // Bitcast the original vector into a vector of store-size units
32811 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
32812 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
32813 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
32814 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
32815 SmallVector<SDValue, 8> Chains;
32816 SDValue Ptr = St->getBasePtr();
32818 // Perform one or more big stores into memory.
32819 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32820 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32821 StoreType, ShuffWide,
32822 DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32825 St->getAlignment(), St->getMemOperand()->getFlags());
32826 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32827 Chains.push_back(Ch);
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

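  // Illustrative example (assumed): a truncating store of v4i32 to v4i8
  // shuffles bytes {0, 4, 8, 12} down to the low 32 bits of the register and
  // then issues a single i32 store instead of four separate byte stores.
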
32833 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32834 // the FP state in cases where an emms may be missing.
32835 // A preferable solution to the general problem is to figure out the right
32836 // places to insert EMMS. This qualifies as a quick hack.
32838 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

32842 const Function *F = DAG.getMachineFunction().getFunction();
32843 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32846 if ((VT.isVector() ||
32847 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32848 isa<LoadSDNode>(St->getValue()) &&
32849 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32850 St->getChain().hasOneUse() && !St->isVolatile()) {
32851 SDNode* LdVal = St->getValue().getNode();
32852 LoadSDNode *Ld = nullptr;
32853 int TokenFactorIndex = -1;
32854 SmallVector<SDValue, 8> Ops;
32855 SDNode* ChainVal = St->getChain().getNode();
32856 // Must be a store of a load. We currently handle two cases: the load
32857 // is a direct child, and it's under an intervening TokenFactor. It is
32858 // possible to dig deeper under nested TokenFactors.
32859 if (ChainVal == LdVal)
32860 Ld = cast<LoadSDNode>(St->getChain());
32861 else if (St->getValue().hasOneUse() &&
32862 ChainVal->getOpcode() == ISD::TokenFactor) {
32863 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32864 if (ChainVal->getOperand(i).getNode() == LdVal) {
32865 TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld), StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
32886 if (Subtarget.is64Bit() || F64IsLegal) {
32887 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32888 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32889 Ld->getPointerInfo(), Ld->getAlignment(),
32890 Ld->getMemOperand()->getFlags());
32891 SDValue NewChain = NewLd.getValue(1);
32892 if (TokenFactorIndex >= 0) {
32893 Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());
    }

32901 // Otherwise, lower to two pairs of 32-bit loads / stores.
32902 SDValue LoAddr = Ld->getBasePtr();
32903 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32905 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32906 Ld->getPointerInfo(), Ld->getAlignment(),
32907 Ld->getMemOperand()->getFlags());
32908 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32909 Ld->getPointerInfo().getWithOffset(4),
32910 MinAlign(Ld->getAlignment(), 4),
32911 Ld->getMemOperand()->getFlags());
32913 SDValue NewChain = LoLd.getValue(1);
32914 if (TokenFactorIndex >= 0) {
32915 Ops.push_back(LoLd);
32916 Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
    }

32920 LoAddr = St->getBasePtr();
32921 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
    SDValue LoSt =
        DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(
        NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
        MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

32932 // This is similar to the above case, but here we handle a scalar 64-bit
32933 // integer store that is extracted from a vector on a 32-bit target.
32934 // If we have SSE2, then we can treat it like a floating-point double
32935 // to get past legalization. The execution dependencies fixup pass will
32936 // choose the optimal machine instruction for the store if this really is
32937 // an integer or v2f32 rather than an f64.
32938 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32939 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32940 SDValue OldExtract = St->getOperand(1);
32941 SDValue ExtOp0 = OldExtract.getOperand(0);
32942 unsigned VecSize = ExtOp0.getValueSizeInBits();
32943 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
32944 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
32945 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
32946 BitCast, OldExtract.getOperand(1));
32947 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
32948 St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}
32955 /// Return 'true' if this vector operation is "horizontal"
32956 /// and return the operands for the horizontal operation in LHS and RHS. A
32957 /// horizontal operation performs the binary operation on successive elements
32958 /// of its first operand, then on successive elements of its second operand,
32959 /// returning the resulting values in a vector. For example, if
32960 /// A = < float a0, float a1, float a2, float a3 >
32962 /// B = < float b0, float b1, float b2, float b3 >
32963 /// then the result of doing a horizontal operation on A and B is
32964 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
32965 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
32966 /// A horizontal-op B, for some already available A and B, and if so then LHS is
32967 /// set to A, RHS to B, and the routine returns 'true'.
32968 /// Note that the binary operation should have the property that if one of the
32969 /// operands is UNDEF then the result is UNDEF.
32970 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
32971 // Look for the following pattern: if
32972 // A = < float a0, float a1, float a2, float a3 >
32973 // B = < float b0, float b1, float b2, float b3 >
32975 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
32976 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
32977 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
32978 // which is A horizontal-op B.
32980 // At least one of the operands should be a vector shuffle.
32981 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

32985 MVT VT = LHS.getSimpleValueType();
32987 assert((VT.is128BitVector() || VT.is256BitVector()) &&
32988 "Unsupported vector type for horizontal add/sub");
32990 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
32991 // operate independently on 128-bit lanes.
32992 unsigned NumElts = VT.getVectorNumElements();
32993 unsigned NumLanes = VT.getSizeInBits()/128;
32994 unsigned NumLaneElts = NumElts / NumLanes;
32995 assert((NumLaneElts % 2 == 0) &&
32996 "Vector type should have an even number of elements in each lane");
32997 unsigned HalfLaneElts = NumLaneElts/2;
32999 // View LHS in the form
33000 // LHS = VECTOR_SHUFFLE A, B, LMask
33001 // If LHS is not a shuffle then pretend it is the shuffle
33002 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      return false;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
33025 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33026 if (!RHS.getOperand(0).isUndef())
33027 C = RHS.getOperand(0);
33028 if (!RHS.getOperand(1).isUndef())
33029 D = RHS.getOperand(1);
33030 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      return false;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

33052 // At this point LHS and RHS are equivalent to
33053 // LHS = VECTOR_SHUFFLE A, B, LMask
33054 // RHS = VECTOR_SHUFFLE A, B, RMask
33055 // Check that the masks correspond to performing a horizontal operation.
33056 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33057 for (unsigned i = 0; i != NumLaneElts; ++i) {
33058 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33060 // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}
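
// Worked example (illustrative): for v4f32 with
//   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
//   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
// LHS + RHS is <a0+a1, a2+a3, b0+b1, b2+b3>, which is exactly HADDPS A, B.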
33081 /// Do target-specific dag combines on floating-point adds/subs.
33082 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33083 const X86Subtarget &Subtarget) {
33084 EVT VT = N->getValueType(0);
33085 SDValue LHS = N->getOperand(0);
33086 SDValue RHS = N->getOperand(1);
33087 bool IsFadd = N->getOpcode() == ISD::FADD;
33088 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33090 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33091 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33092 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33093 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33094 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}
/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
33102 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33103 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
33106 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33107 SDValue Src = N->getOperand(0);
33108 unsigned Opcode = Src.getOpcode();
33109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33111 EVT VT = N->getValueType(0);
33112 EVT SrcVT = Src.getValueType();
  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };
33150 // Don't combine if the operation has other uses.
  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}
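
// Illustrative example (assumed): on AVX2, trunc(mul <4 x i64> %x, %y) to
// <4 x i32> is rewritten as mul <4 x i32> (trunc %x), (trunc %y), because the
// 64-bit element multiply would otherwise have to be expanded.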
33194 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}
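
// Illustrative sketch (assuming SSE4.1): to truncate 2 x v4i32 to v8i16, each
// input is ANDed with 0xFFFF and the pair is combined with a single PACKUSDW;
// the mask guarantees no unsigned saturation occurs, so the pack is an exact
// truncation.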
33249 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}
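
// Design note: the VSHLI/VSRAI pair sign-extends the low 16 bits of each
// 32-bit lane, so PACKSSDW's signed saturation never triggers and the pack
// again yields an exact truncation.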
33278 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33279 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33280 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33281 /// element that is extracted from a vector and then truncated, and it is
33282 /// difficult to do this optimization based on them.
33283 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33284 const X86Subtarget &Subtarget) {
33285 EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

33317 // Split a long vector into vectors of legal type.
33318 unsigned RegNum = InVT.getSizeInBits() / 128;
33319 SmallVector<SDValue, 8> SubVec(RegNum);
33320 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33321 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33323 for (unsigned i = 0; i < RegNum; i++)
33324 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33325 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33327 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33328 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33329 // truncate 2 x v4i32 to v8i16.
33330 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33331 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33332 else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}
33338 /// This function transforms vector truncation of 'all or none' bits values.
33339 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Use PACKSS if the input is a splatted sign bit.
  // e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits != InSVT.getSizeInBits())
    return SDValue();

  // Check we have a truncation suited for PACKSS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
}
33377 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33378 const X86Subtarget &Subtarget) {
33379 EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign bits with PACKSS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}
33410 /// Returns the negated value if the node \p N flips sign of FP value.
33412 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33413 /// AVX512F does not have FXOR, so FNEG is lowered as
33414 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33415 /// In this case we go though all bitcasts.
33416 static SDValue isFNEG(SDNode *N) {
33417 if (N->getOpcode() == ISD::FNEG)
33418 return N->getOperand(0);
33420 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

33435 // There is more than one way to represent the same constant on
33436 // the different X86 targets. The type of the node may also depend on size.
33437 // - load scalar value and broadcast
33438 // - BUILD_VECTOR node
33439 // - load from a constant pool.
33440 // We check all variants here.
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;

  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;

  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }

  return SDValue();
}
33463 /// Do target-specific dag combines on floating point negations.
33464 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33465 const X86Subtarget &Subtarget) {
33466 EVT OrigVT = N->getValueType(0);
33467 SDValue Arg = isFNEG(N);
33468 assert(Arg.getNode() && "N is expected to be an FNEG node");
33470 EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

33478 // If we're negating a FMUL node on a target with FMA, then we can avoid the
33479 // use of a constant by performing (-0 - A*B) instead.
33480 // FIXME: Check rounding control flags as well once it becomes available.
33481 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33482 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33483 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33484 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33485 Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

33489 // If we're negating an FMA node, then we can adjust the
33490 // instruction to include the extra negation.
33491 unsigned NewOpcode = 0;
33492 if (Arg.hasOneUse()) {
33493 switch (Arg.getOpcode()) {
33494 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33495 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33496 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33497 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33498 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33499 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33500 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33501 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33502 // We can't handle scalar intrinsic node here because it would only
33503 // invert one element and not the whole vector. But we could try to handle
33504 // a negation of the lower element only.
    }
  }

  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}
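
// Note: the folds above rely on identities such as -(a*b + c) == -(a*b) - c,
// which is exactly FNMSUB, so the explicit negation disappears into the FMA
// opcode.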
33514 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33515 const X86Subtarget &Subtarget) {
33516 MVT VT = N->getSimpleValueType(0);
33517 // If we have integer vector types available, use the integer opcodes.
33518 if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);
    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR: IntOpcode = ISD::OR; break;
    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
    case X86ISD::FAND: IntOpcode = ISD::AND; break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }

  return SDValue();
}
33539 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
33540 TargetLowering::DAGCombinerInfo &DCI,
33541 const X86Subtarget &Subtarget) {
  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (Subtarget.hasCMov())
    if (SDValue RV = combineIntegerAbs(N, DAG))
      return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}
33564 static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}
33568 /// If a value is a scalar FP zero or a vector FP zero (potentially including
33569 /// undefined elements), return a zero constant that may be used to fold away
33570 /// that value. In the case of a vector, the returned constant will not contain
33571 /// undefined elements even if the input parameter does. This makes it suitable
33572 /// to be used as a replacement operand with operations (eg, bitwise-and) where
33573 /// an undef should not propagate.
33574 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
33575 const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}
33585 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
33586 const X86Subtarget &Subtarget) {
33587 SDValue N0 = N->getOperand(0);
33588 SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}
33613 /// Do target-specific dag combines on X86ISD::FAND nodes.
33614 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
33615 const X86Subtarget &Subtarget) {
33616 // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
33630 /// Do target-specific dag combines on X86ISD::FANDN nodes.
33631 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
33632 const X86Subtarget &Subtarget) {
33633 // FANDN(0.0, x) -> x
33634 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33635 return N->getOperand(1);
33637 // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
33644 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
33645 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
33646 const X86Subtarget &Subtarget) {
33647 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
33649 // F[X]OR(0.0, x) -> x
33650 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33651 return N->getOperand(1);
33653 // F[X]OR(x, 0.0) -> x
33654 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
33655 return N->getOperand(0);
  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}
33664 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
33665 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
33666 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
33668 // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are Commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
    default: llvm_unreachable("unknown opcode");
    case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
    case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}
33685 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
33686 const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

33690 // TODO: Check for global or instruction-level "nnan". In that case, we
33691 // should be able to lower to FMAX/FMIN alone.
33692 // TODO: If an operand is already known to be a NaN or not a NaN, this
33693 // should be an optional swap and FMAX/FMIN.
33695 EVT VT = N->getValueType(0);
33696 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
33697 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

33701 // This takes at least 3 instructions, so favor a library call when operating
33702 // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

33706 SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);

33709 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
33710 DAG.getDataLayout(), *DAG.getContext(), VT);
  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
33722 // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
33725 // Max = Op1 > Op0 ? Op1 : Op0
33727 // So they always return Op0 if either input is a NaN. However, we can still
33728 // use those instructions for fmaxnum by selecting away a NaN input.
33730 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
33731 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
33732 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

33735 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
33736 // are NaN, the NaN value of Op1 is the result.
33737 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
  return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
}
33741 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
33742 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
33743 TargetLowering::DAGCombinerInfo &DCI,
33744 const X86Subtarget &Subtarget) {
33745 // ANDNP(0, x) -> x
33746 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
33747 return N->getOperand(1);
33749 // ANDNP(x, 0) -> 0
33750 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
33751 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
33753 EVT VT = N->getValueType(0);
33755 // Attempt to recursively combine a bitmask ANDNP with shuffles.
33756 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}
33769 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
33770 TargetLowering::DAGCombinerInfo &DCI) {
33771 // BT ignores high bits in the bit index operand.
33772 SDValue Op1 = N->getOperand(1);
33773 if (Op1.hasOneUse()) {
33774 unsigned BitWidth = Op1.getValueSizeInBits();
33775 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
33776 APInt KnownZero, KnownOne;
33777 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
33778 !DCI.isBeforeLegalizeOps());
33779 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33780 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
33781 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }

  return SDValue();
}
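
// Note: for a 64-bit BT only the low Log2(64) = 6 bits of the bit-index
// operand are demanded (the hardware masks the index), so wider constants can
// be shrunk and unrelated high-bit computations dropped.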
33787 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
33788 const X86Subtarget &Subtarget) {
33789 EVT VT = N->getValueType(0);
33790 if (!VT.isVector())
33793 SDValue N0 = N->getOperand(0);
33794 SDValue N1 = N->getOperand(1);
33795 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
// SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2,
// since there is no sign-extended shift-right operation on a vector with
// 64-bit elements.
33801 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
33802 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
33803 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
33804 N0.getOpcode() == ISD::SIGN_EXTEND)) {
33805 SDValue N00 = N0.getOperand(0);
// An EXTLOAD has a better solution on AVX2: it may be replaced with an
// X86ISD::VSEXT node.
33809 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
33810 if (!ISD::isNormalLoad(N00.getNode()))
33813 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
33814 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
33816 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
33822 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
33823 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
33824 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
33825 /// opportunities to combine math ops, use an LEA, or use a complex addressing
33826 /// mode. This can eliminate extend, add, and shift instructions.
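///
/// For example (a sketch, not taken from a specific test): for i64-wide math,
///   sext i32 (add nsw %x, 40) to i64  -->  add i64 (sext %x to i64), 40
/// and the resulting 64-bit add can then fold into an addressing mode or LEA
/// together with a later 'add' or 'shl' user.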
33827 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
33828 const X86Subtarget &Subtarget) {
33829 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
33830 Ext->getOpcode() != ISD::ZERO_EXTEND)
33833 // TODO: This should be valid for other integer types.
33834 EVT VT = Ext->getValueType(0);
33835 if (VT != MVT::i64)
33838 SDValue Add = Ext->getOperand(0);
33839 if (Add.getOpcode() != ISD::ADD)
33842 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
33843 bool NSW = Add->getFlags()->hasNoSignedWrap();
33844 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding into
// the 'zext', for the transform to be safe.
33848 if ((Sext && !NSW) || (!Sext && !NUW))
33851 // Having a constant operand to the 'add' ensures that we are not increasing
33852 // the instruction count because the constant is extended for free below.
33853 // A constant operand can also become the displacement field of an LEA.
33854 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
33858 // Don't make the 'add' bigger if there's no hope of combining it with some
33859 // other 'add' or 'shl' instruction.
33860 // TODO: It may be profitable to generate simpler LEA instructions in place
33861 // of single 'add' instructions, but the cost model for selecting an LEA
33862 // currently has a high threshold.
33863 bool HasLEAPotential = false;
33864 for (auto *User : Ext->uses()) {
33865 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
33866 HasLEAPotential = true;
33870 if (!HasLEAPotential)
33873 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
33874 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
33875 SDValue AddOp0 = Add.getOperand(0);
33876 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
33877 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
// The wider add is guaranteed to not wrap because both operands are
// sign-extended or zero-extended as required.
SDNodeFlags Flags;
33882 Flags.setNoSignedWrap(NSW);
33883 Flags.setNoUnsignedWrap(NUW);
33884 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
33889 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
33890 /// extends from AH (which we otherwise need to do contortions to access).
33891 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
33892 SDValue N0 = N->getOperand(0);
33893 auto OpcodeN = N->getOpcode();
33894 auto OpcodeN0 = N0.getOpcode();
33895 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
33896 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
33899 EVT VT = N->getValueType(0);
33900 EVT InVT = N0.getValueType();
33901 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
33904 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
33905 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
33906 : X86ISD::UDIVREM8_ZEXT_HREG;
33907 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
33909 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
33910 return R.getValue(1);
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or
/// concatenating it with UNDEFs) into vectors of the same size as the target
/// type, which then extend their lowest elements.
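///
/// E.g. (a sketch) zext v4i8 %x to v4i32 with SSE4.1 becomes:
///   %wide = concat_vectors v4i8 %x, undef, undef, undef   ; pad to v16i8
///   %res  = zero_extend_vector_inreg %wide to v4i32       ; -> PMOVZXBD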
33917 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
33918 TargetLowering::DAGCombinerInfo &DCI,
33919 const X86Subtarget &Subtarget) {
33920 unsigned Opcode = N->getOpcode();
33921 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
33923 if (!DCI.isBeforeLegalizeOps())
33925 if (!Subtarget.hasSSE2())
33928 SDValue N0 = N->getOperand(0);
33929 EVT VT = N->getValueType(0);
33930 EVT SVT = VT.getScalarType();
33931 EVT InVT = N0.getValueType();
33932 EVT InSVT = InVT.getScalarType();
33934 // Input type must be a vector and we must be extending legal integer types.
33935 if (!VT.isVector())
33937 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
33939 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
33942 // On AVX2+ targets, if the input/output types are both legal then we will be
33943 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
33944 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
33945 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
33950 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
33951 EVT InVT = N.getValueType();
33952 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
33953 Size / InVT.getScalarSizeInBits());
33954 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
33955 DAG.getUNDEF(InVT));
33957 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
// If the target size is less than 128 bits, widen to a type that would extend
// to 128 bits, extend that, and extract the original target vector.
33962 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
33963 unsigned Scale = 128 / VT.getSizeInBits();
33965 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
33966 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
33967 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
33968 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
33969 DAG.getIntPtrConstant(0, DL));
33972 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
33973 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
// Also use this if we don't have SSE41, to allow the legalizer to do its job.
33975 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
33976 (VT.is256BitVector() && Subtarget.hasInt256()) ||
33977 (VT.is512BitVector() && Subtarget.hasAVX512())) {
33978 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
33979 return Opcode == ISD::SIGN_EXTEND
33980 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
33981 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
33984 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
33985 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
33986 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
33987 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
33988 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
33990 SmallVector<SDValue, 8> Opnds;
33991 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
33992 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
33993 DAG.getIntPtrConstant(Offset, DL));
33994 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
33995 SrcVec = Opcode == ISD::SIGN_EXTEND
33996 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
33997 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
33998 Opnds.push_back(SrcVec);
34000 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34003 // On pre-AVX2 targets, split into 128-bit nodes of
34004 // ISD::*_EXTEND_VECTOR_INREG.
34005 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34006 return SplitAndExtendInReg(128);
34008 // On pre-AVX512 targets, split into 256-bit nodes of
34009 // ISD::*_EXTEND_VECTOR_INREG.
34010 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34011 return SplitAndExtendInReg(256);
34016 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34017 TargetLowering::DAGCombinerInfo &DCI,
34018 const X86Subtarget &Subtarget) {
34019 SDValue N0 = N->getOperand(0);
34020 EVT VT = N->getValueType(0);
34021 EVT InVT = N0.getValueType();
34024 if (SDValue DivRem8 = getDivRem8(N, DAG))
34027 if (!DCI.isBeforeLegalizeOps()) {
34028 if (InVT == MVT::i1) {
34029 SDValue Zero = DAG.getConstant(0, DL, VT);
34030 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34031 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
34036 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34037 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
// Inverting and sign-extending a boolean is the same as zero-extending it and
// subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
// efficiently lowered with an LEA or a DEC. This is the same as:
// select Bool, 0, -1.
// sext (xor Bool, -1) --> sub (zext Bool), 1
34042 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34043 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34046 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34049 if (Subtarget.hasAVX() && VT.is256BitVector())
34050 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34053 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34059 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34060 const X86Subtarget &Subtarget) {
34062 EVT VT = N->getValueType(0);
34064 // Let legalize expand this if it isn't a legal type yet.
34065 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34068 EVT ScalarVT = VT.getScalarType();
34069 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34072 SDValue A = N->getOperand(0);
34073 SDValue B = N->getOperand(1);
34074 SDValue C = N->getOperand(2);
34076 auto invertIfNegative = [](SDValue &V) {
34077 if (SDValue NegVal = isFNEG(V.getNode())) {
34084 // Do not convert the passthru input of scalar intrinsics.
34085 // FIXME: We could allow negations of the lower element only.
34086 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34087 bool NegB = invertIfNegative(B);
34088 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// The multiplication is negated when exactly one of A and B is negated
// (NegA xor NegB).
34091 bool NegMul = (NegA != NegB);
unsigned NewOpcode;
if (!NegMul)
  NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
  NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34100 if (N->getOpcode() == X86ISD::FMADD_RND) {
34101 switch (NewOpcode) {
34102 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34103 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34104 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34105 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34107 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34108 switch (NewOpcode) {
34109 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34110 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34111 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34112 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34114 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34115 switch (NewOpcode) {
34116 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34117 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34118 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34119 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34122 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34123 "Unexpected opcode!");
34124 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34127 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34130 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34131 TargetLowering::DAGCombinerInfo &DCI,
34132 const X86Subtarget &Subtarget) {
34133 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34134 // (and (i32 x86isd::setcc_carry), 1)
34135 // This eliminates the zext. This transformation is necessary because
34136 // ISD::SETCC is always legalized to i8.
34138 SDValue N0 = N->getOperand(0);
34139 EVT VT = N->getValueType(0);
34141 if (N0.getOpcode() == ISD::AND &&
34143 N0.getOperand(0).hasOneUse()) {
34144 SDValue N00 = N0.getOperand(0);
34145 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34146 if (!isOneConstant(N0.getOperand(1)))
34148 return DAG.getNode(ISD::AND, dl, VT,
34149 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34150 N00.getOperand(0), N00.getOperand(1)),
34151 DAG.getConstant(1, dl, VT));
34155 if (N0.getOpcode() == ISD::TRUNCATE &&
34157 N0.getOperand(0).hasOneUse()) {
34158 SDValue N00 = N0.getOperand(0);
34159 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34160 return DAG.getNode(ISD::AND, dl, VT,
34161 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34162 N00.getOperand(0), N00.getOperand(1)),
34163 DAG.getConstant(1, dl, VT));
34167 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34170 if (VT.is256BitVector())
34171 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34174 if (SDValue DivRem8 = getDivRem8(N, DAG))
34177 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34180 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34186 /// Try to map a 128-bit or larger integer comparison to vector instructions
34187 /// before type legalization splits it up into chunks.
34188 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34189 const X86Subtarget &Subtarget) {
34190 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34191 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34193 // We're looking for an oversized integer equality comparison, but ignore a
34194 // comparison with zero because that gets special treatment in EmitTest().
34195 SDValue X = SetCC->getOperand(0);
34196 SDValue Y = SetCC->getOperand(1);
34197 EVT OpVT = X.getValueType();
34198 unsigned OpSize = OpVT.getSizeInBits();
34199 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34202 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34203 // TODO: Add support for AVX-512.
34204 EVT VT = SetCC->getValueType(0);
34206 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34207 (OpSize == 256 && Subtarget.hasAVX2())) {
34208 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34209 SDValue VecX = DAG.getBitcast(VecVT, X);
34210 SDValue VecY = DAG.getBitcast(VecVT, Y);
34212 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34213 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34214 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34215 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34216 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34217 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34218 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34219 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34221 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34227 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34228 const X86Subtarget &Subtarget) {
34229 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34230 SDValue LHS = N->getOperand(0);
34231 SDValue RHS = N->getOperand(1);
34232 EVT VT = N->getValueType(0);
34235 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34236 EVT OpVT = LHS.getValueType();
34237 // 0-x == y --> x+y == 0
34238 // 0-x != y --> x+y != 0
34239 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34241 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34242 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34244 // x == 0-y --> x+y == 0
34245 // x != 0-y --> x+y != 0
34246 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34248 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34249 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34252 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34256 if (VT.getScalarType() == MVT::i1 &&
34257 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
    (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34260 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34261 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34263 if (!IsSEXT0 || !IsVZero1) {
34264 // Swap the operands and update the condition code.
34265 std::swap(LHS, RHS);
34266 CC = ISD::getSetCCSwappedOperands(CC);
34268 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34269 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34270 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34273 if (IsSEXT0 && IsVZero1) {
34274 assert(VT == LHS.getOperand(0).getValueType() &&
34275 "Uexpected operand type");
34276 if (CC == ISD::SETGT)
34277 return DAG.getConstant(0, DL, VT);
34278 if (CC == ISD::SETLE)
34279 return DAG.getConstant(1, DL, VT);
34280 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34281 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34283 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34284 "Unexpected condition code!");
34285 return LHS.getOperand(0);
34289 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34290 // to avoid scalarization via legalization because v4i32 is not a legal type.
34291 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34292 LHS.getValueType() == MVT::v4f32)
34293 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34298 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34300 // Gather and Scatter instructions use k-registers for masks. The type of
34301 // the masks is v*i1. So the mask will be truncated anyway.
// The SIGN_EXTEND_INREG may therefore be dropped.
34303 SDValue Mask = N->getOperand(2);
34304 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34305 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34306 NewOps[2] = Mask.getOperand(0);
34307 DAG.UpdateNodeOperands(N, NewOps);
34312 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34313 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34314 const X86Subtarget &Subtarget) {
34316 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34317 SDValue EFLAGS = N->getOperand(1);
34319 // Try to simplify the EFLAGS and condition code operands.
34320 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34321 return getSETCC(CC, Flags, DL, DAG);
34326 /// Optimize branch condition evaluation.
34327 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34328 const X86Subtarget &Subtarget) {
34330 SDValue EFLAGS = N->getOperand(3);
34331 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34333 // Try to simplify the EFLAGS and condition code operands.
34334 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34335 // RAUW them under us.
34336 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34337 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34338 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34339 N->getOperand(1), Cond, Flags);
34345 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34346 SelectionDAG &DAG) {
34347 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34348 // optimize away operation when it's from a constant.
34350 // The general transformation is:
34351 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34352 // AND(VECTOR_CMP(x,y), constant2)
34353 // constant2 = UNARYOP(constant)
34355 // Early exit if this isn't a vector operation, the operand of the
34356 // unary operation isn't a bitwise AND, or if the sizes of the operations
34357 // aren't the same.
34358 EVT VT = N->getValueType(0);
34359 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34360 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34361 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34364 // Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// whether that would be a benefit: it would not eliminate any operations, just
34367 // perform one more step in scalar code before moving to the vector unit.
34368 if (BuildVectorSDNode *BV =
34369 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34370 // Bail out if the vector isn't a constant.
34371 if (!BV->isConstant())
34374 // Everything checks out. Build up the new and improved node.
34376 EVT IntVT = BV->getValueType(0);
34377 // Create a new constant of the appropriate type for the transformed
34379 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34380 // The AND node needs bitcasts to/from an integer vector type around it.
34381 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34382 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34383 N->getOperand(0)->getOperand(0), MaskConst);
34384 SDValue Res = DAG.getBitcast(VT, NewAnd);
34391 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34392 const X86Subtarget &Subtarget) {
34393 SDValue Op0 = N->getOperand(0);
34394 EVT VT = N->getValueType(0);
34395 EVT InVT = Op0.getValueType();
34396 EVT InSVT = InVT.getScalarType();
34397 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34399 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34400 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
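// (After zero-extending from i8/i16, every element fits in 16 bits, so the
// i32 sign bit is clear and a signed conversion produces the same value when
// a native unsigned conversion isn't available.)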
34401 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34403 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34404 InVT.getVectorNumElements());
34405 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34407 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34408 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34410 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
// Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero, so perform
// that optimization here.
34416 if (DAG.SignBitIsZero(Op0))
34417 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34422 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34423 const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally selected from a constant vector. Vectors only.
34426 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34429 // Now move on to more general possibilities.
34430 SDValue Op0 = N->getOperand(0);
34431 EVT VT = N->getValueType(0);
34432 EVT InVT = Op0.getValueType();
34433 EVT InSVT = InVT.getScalarType();
34435 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34436 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34437 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34438 if (InVT.isVector() &&
34439 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34440 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34442 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34443 InVT.getVectorNumElements());
34444 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34445 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34448 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34449 // vectors and scalars, see if we know that the upper bits are all the sign
34450 // bit, in which case we can truncate the input to i32 and convert from that.
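// E.g. if an i64 input is known to have at least 33 sign bits, its value is
// exactly the sign extension of its low 32 bits, so converting the truncated
// i32 (cvtsi2ss/cvtsi2sd style) loses nothing.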
34451 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34452 unsigned BitWidth = InVT.getScalarSizeInBits();
34453 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34454 if (NumSignBits >= (BitWidth - 31)) {
34455 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34456 if (InVT.isVector())
34457 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34458 InVT.getVectorNumElements());
34460 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34461 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34465 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34466 // a 32-bit target where SSE doesn't support i64->FP operations.
34467 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34468 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34469 EVT LdVT = Ld->getValueType(0);
34471 // This transformation is not supported if the result type is f16 or f128.
34472 if (VT == MVT::f16 || VT == MVT::f128)
34475 if (!Ld->isVolatile() && !VT.isVector() &&
34476 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34477 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34478 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34479 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34480 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34487 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
34488 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
34489 X86TargetLowering::DAGCombinerInfo &DCI) {
34490 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
34491 // the result is either zero or one (depending on the input carry bit).
34492 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
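// E.g. an ADC whose two value inputs are both known zero computes 0 + 0 + CF,
// which is just the incoming carry bit, and it can never produce a carry of
// its own (hence the constant-zero carry-out result below).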
34493 if (X86::isZeroNode(N->getOperand(0)) &&
34494 X86::isZeroNode(N->getOperand(1)) &&
34495 // We don't have a good way to replace an EFLAGS use, so only do this when
34497 SDValue(N, 1).use_empty()) {
34499 EVT VT = N->getValueType(0);
34500 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
34501 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
34502 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
34503 DAG.getConstant(X86::COND_B, DL,
34506 DAG.getConstant(1, DL, VT));
34507 return DCI.CombineTo(N, Res1, CarryOut);
34513 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
34514 /// which is more useful than 0/1 in some cases.
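/// E.g. 'sbb %eax, %eax' computes eax - eax - CF, i.e. 0 when CF is clear and
/// 0xFFFFFFFF when CF is set; for non-i1 result types an 'and $1' below turns
/// that back into a 0/1 value.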
34515 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
34517 // "Condition code B" is also known as "the carry flag" (CF).
34518 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
34519 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
MVT VT = N->getSimpleValueType(0);
if (VT != MVT::i1)
  return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

assert(VT == MVT::i1 && "Unexpected type for SETCC node");
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
34528 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
34529 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
34530 /// with CMP+{ADC, SBB}.
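///
/// For example (a sketch in x86 terms): 'x + (y != 0)' can become
///   cmpl $1, %ecx        ; CF = (y == 0)
///   sbbl $-1, %eax       ; x - (-1) - CF == x + (y != 0)
/// instead of a test/setne/movzbl/add sequence.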
34531 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
34532 bool IsSub = N->getOpcode() == ISD::SUB;
34533 SDValue X = N->getOperand(0);
34534 SDValue Y = N->getOperand(1);
34536 // If this is an add, canonicalize a zext operand to the RHS.
34537 // TODO: Incomplete? What if both sides are zexts?
34538 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
34539 Y.getOpcode() != ISD::ZERO_EXTEND)
34542 // Look through a one-use zext.
34543 bool PeekedThroughZext = false;
34544 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
34545 Y = Y.getOperand(0);
34546 PeekedThroughZext = true;
34549 // If this is an add, canonicalize a setcc operand to the RHS.
34550 // TODO: Incomplete? What if both sides are setcc?
34551 // TODO: Should we allow peeking through a zext of the other operand?
34552 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
34553 Y.getOpcode() != X86ISD::SETCC)
34556 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
34560 EVT VT = N->getValueType(0);
34561 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
34563 if (CC == X86::COND_B) {
34564 // X + SETB Z --> X + (mask SBB Z, Z)
34565 // X - SETB Z --> X - (mask SBB Z, Z)
34566 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
34567 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
34568 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34569 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34570 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34573 if (CC == X86::COND_A) {
34574 SDValue EFLAGS = Y->getOperand(1);
34575 // Try to convert COND_A into COND_B in an attempt to facilitate
34576 // materializing "setb reg".
// Do not flip "e > c", where "c" is a constant, because the CMP instruction
// cannot take an immediate as its first operand.
34581 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
34582 EFLAGS.getValueType().isInteger() &&
34583 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
34584 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
34585 EFLAGS.getNode()->getVTList(),
34586 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
34587 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
34588 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
34589 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
34590 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
34591 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
34595 if (CC != X86::COND_E && CC != X86::COND_NE)
34598 SDValue Cmp = Y.getOperand(1);
34599 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
34600 !X86::isZeroNode(Cmp.getOperand(1)) ||
34601 !Cmp.getOperand(0).getValueType().isInteger())
34604 // (cmp Z, 1) sets the carry flag if Z is 0.
34605 SDValue Z = Cmp.getOperand(0);
34606 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
34607 DAG.getConstant(1, DL, Z.getValueType()));
34609 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
34611 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
34612 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
34613 if (CC == X86::COND_NE)
34614 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
34615 DAG.getConstant(-1ULL, DL, VT), NewCmp);
34617 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
34618 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
34619 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
34620 DAG.getConstant(0, DL, VT), NewCmp);
34623 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
34624 const X86Subtarget &Subtarget) {
34625 SDValue MulOp = N->getOperand(0);
34626 SDValue Phi = N->getOperand(1);
34628 if (MulOp.getOpcode() != ISD::MUL)
34629 std::swap(MulOp, Phi);
34630 if (MulOp.getOpcode() != ISD::MUL)
34634 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
34637 EVT VT = N->getValueType(0);
34639 unsigned RegSize = 128;
34640 if (Subtarget.hasBWI())
34642 else if (Subtarget.hasAVX2())
34644 unsigned VectorSize = VT.getVectorNumElements() * 16;
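// VectorSize is the width of the multiply operands once they are truncated
// back to i16 (one i16 lane per i32 element of VT); it must be at least 128
// bits and fit in the widest available vector register.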
34645 // If the vector size is less than 128, or greater than the supported RegSize,
34646 // do not use PMADD.
34647 if (VectorSize < 128 || VectorSize > RegSize)
34651 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
34652 VT.getVectorNumElements());
34653 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34654 VT.getVectorNumElements() / 2);
34656 // Shrink the operands of mul.
34657 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
34658 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
34660 // Madd vector size is half of the original vector size
34661 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
34662 // Fill the rest of the output with 0
34663 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
34664 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
34665 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
34668 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
34669 const X86Subtarget &Subtarget) {
34671 EVT VT = N->getValueType(0);
34672 SDValue Op0 = N->getOperand(0);
34673 SDValue Op1 = N->getOperand(1);
34675 // TODO: There's nothing special about i32, any integer type above i16 should
34676 // work just as well.
34677 if (!VT.isVector() || !VT.isSimple() ||
34678 !(VT.getVectorElementType() == MVT::i32))
34681 unsigned RegSize = 128;
34682 if (Subtarget.hasBWI())
34684 else if (Subtarget.hasAVX2())
34687 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
34688 // TODO: We should be able to handle larger vectors by splitting them before
34689 // feeding them into several SADs, and then reducing over those.
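// (VT.getSizeInBits() / 4 is the width of the underlying i8 source data:
// each i32 accumulator lane corresponds to one i8 element, so e.g. a v16i32
// reduction consumes 128 bits of i8 inputs.)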
34690 if (VT.getSizeInBits() / 4 > RegSize)
34693 // We know N is a reduction add, which means one of its operands is a phi.
34694 // To match SAD, we need the other operand to be a vector select.
34695 SDValue SelectOp, Phi;
34696 if (Op0.getOpcode() == ISD::VSELECT) {
34699 } else if (Op1.getOpcode() == ISD::VSELECT) {
34705 // Check whether we have an abs-diff pattern feeding into the select.
34706 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
34709 // SAD pattern detected. Now build a SAD instruction and an addition for
34710 // reduction. Note that the number of elements of the result of SAD is less
// than the number of elements of its input. Therefore, we only update part
// of the elements in the reduction vector.
34713 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
34715 // The output of PSADBW is a vector of i64.
34716 // We need to turn the vector of i64 into a vector of i32.
34717 // If the reduction vector is at least as wide as the psadbw result, just
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// anyway.
34720 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
34721 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
34722 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
34724 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
34726 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
// Update part of the elements of the reduction vector. This is done by first
// extracting a sub-vector from it, updating this sub-vector, and inserting
// it back.
34730 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
34731 DAG.getIntPtrConstant(0, DL));
34732 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
34733 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
34734 DAG.getIntPtrConstant(0, DL));
34736 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
34739 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
34740 const X86Subtarget &Subtarget) {
34741 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
34742 if (Flags->hasVectorReduction()) {
34743 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
34745 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
34748 EVT VT = N->getValueType(0);
34749 SDValue Op0 = N->getOperand(0);
34750 SDValue Op1 = N->getOperand(1);
34752 // Try to synthesize horizontal adds from adds of shuffles.
34753 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34754 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34755 isHorizontalBinOp(Op0, Op1, true))
34756 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
34758 return combineAddOrSubToADCOrSBB(N, DAG);
34761 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
34762 const X86Subtarget &Subtarget) {
34763 SDValue Op0 = N->getOperand(0);
34764 SDValue Op1 = N->getOperand(1);
34766 // X86 can't encode an immediate LHS of a sub. See if we can push the
34767 // negation into a preceding instruction.
34768 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
// If the RHS of the sub is an XOR with one use and a constant, invert the
// immediate. Then add one to the LHS of the sub so we can turn
// X-Y -> X+~Y+1, saving one register.
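// E.g. '5 - (x ^ 2)' becomes '(x ^ ~2) + 6', using ~(x ^ C) == x ^ ~C and
// then folding the +1 into the constant LHS.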
34772 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
34773 isa<ConstantSDNode>(Op1.getOperand(1))) {
34774 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
34775 EVT VT = Op0.getValueType();
34776 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
34778 DAG.getConstant(~XorC, SDLoc(Op1), VT));
34779 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
34780 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
34784 // Try to synthesize horizontal subs from subs of shuffles.
34785 EVT VT = N->getValueType(0);
34786 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
34787 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
34788 isHorizontalBinOp(Op0, Op1, false))
34789 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
34791 return combineAddOrSubToADCOrSBB(N, DAG);
34794 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
34795 TargetLowering::DAGCombinerInfo &DCI,
34796 const X86Subtarget &Subtarget) {
34797 if (DCI.isBeforeLegalize())
34801 unsigned Opcode = N->getOpcode();
34802 MVT VT = N->getSimpleValueType(0);
34803 MVT SVT = VT.getVectorElementType();
34804 unsigned NumElts = VT.getVectorNumElements();
34805 unsigned EltSizeInBits = SVT.getSizeInBits();
34807 SDValue Op = N->getOperand(0);
34808 MVT OpVT = Op.getSimpleValueType();
34809 MVT OpEltVT = OpVT.getVectorElementType();
34810 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
34811 unsigned InputBits = OpEltSizeInBits * NumElts;
34813 // Perform any constant folding.
34814 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
34816 SmallVector<APInt, 64> EltBits;
34817 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
34818 APInt Undefs(NumElts, 0);
34819 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
34821 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
34822 for (unsigned i = 0; i != NumElts; ++i) {
34823 if (UndefElts[i]) {
34827 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
34828 : EltBits[i].sextOrTrunc(EltSizeInBits);
34830 return getConstVector(Vals, Undefs, VT, DAG, DL);
// (vzext (bitcast (vzext x))) -> (vzext x)
// TODO: (vsext (bitcast (vsext x))) -> (vsext x)
34835 SDValue V = peekThroughBitcasts(Op);
34836 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
34837 MVT InnerVT = V.getSimpleValueType();
34838 MVT InnerEltVT = InnerVT.getVectorElementType();
34840 // If the element sizes match exactly, we can just do one larger vzext. This
34841 // is always an exact type match as vzext operates on integer types.
34842 if (OpEltVT == InnerEltVT) {
34843 assert(OpVT == InnerVT && "Types must match for vzext!");
34844 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
34847 // The only other way we can combine them is if only a single element of the
34848 // inner vzext is used in the input to the outer vzext.
34849 if (InnerEltVT.getSizeInBits() < InputBits)
34852 // In this case, the inner vzext is completely dead because we're going to
34853 // only look at bits inside of the low element. Just do the outer vzext on
34854 // a bitcast of the input to the inner.
34855 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
34858 // Check if we can bypass extracting and re-inserting an element of an input
34859 // vector. Essentially:
34860 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
34861 // TODO: Add X86ISD::VSEXT support
34862 if (Opcode == X86ISD::VZEXT &&
34863 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
34864 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34865 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
34866 SDValue ExtractedV = V.getOperand(0);
34867 SDValue OrigV = ExtractedV.getOperand(0);
34868 if (isNullConstant(ExtractedV.getOperand(1))) {
34869 MVT OrigVT = OrigV.getSimpleValueType();
34870 // Extract a subvector if necessary...
34871 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
34872 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
34873 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
34874 OrigVT.getVectorNumElements() / Ratio);
34875 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
34876 DAG.getIntPtrConstant(0, DL));
34878 Op = DAG.getBitcast(OpVT, OrigV);
34879 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
34886 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
34887 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
34888 const X86Subtarget &Subtarget) {
34889 SDValue Chain = N->getOperand(0);
34890 SDValue LHS = N->getOperand(1);
34891 SDValue RHS = N->getOperand(2);
34892 MVT VT = RHS.getSimpleValueType();
34895 auto *C = dyn_cast<ConstantSDNode>(RHS);
34896 if (!C || C->getZExtValue() != 1)
34899 RHS = DAG.getConstant(-1, DL, VT);
34900 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
34901 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
34902 DAG.getVTList(MVT::i32, MVT::Other),
34903 {Chain, LHS, RHS}, VT, MMO);
// TEST (AND a, b), (AND a, b) -> TEST a, b
34907 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
34908 SDValue Op0 = N->getOperand(0);
34909 SDValue Op1 = N->getOperand(1);
34911 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
34914 EVT VT = N->getValueType(0);
34917 return DAG.getNode(X86ISD::TESTM, DL, VT,
34918 Op0->getOperand(0), Op0->getOperand(1));
34921 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
34922 const X86Subtarget &Subtarget) {
34923 MVT VT = N->getSimpleValueType(0);
34926 if (N->getOperand(0) == N->getOperand(1)) {
34927 if (N->getOpcode() == X86ISD::PCMPEQ)
34928 return getOnesVector(VT, DAG, DL);
34929 if (N->getOpcode() == X86ISD::PCMPGT)
34930 return getZeroVector(VT, Subtarget, DAG, DL);
34936 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
34937 TargetLowering::DAGCombinerInfo &DCI,
34938 const X86Subtarget &Subtarget) {
34939 if (DCI.isBeforeLegalizeOps())
34943 SDValue Vec = N->getOperand(0);
34944 SDValue SubVec = N->getOperand(1);
34945 SDValue Idx = N->getOperand(2);
34947 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
34948 MVT OpVT = N->getSimpleValueType(0);
34949 MVT SubVecVT = SubVec.getSimpleValueType();
34951 // If this is an insert of an extract, combine to a shuffle. Don't do this
34952 // if the insert or extract can be represented with a subvector operation.
34953 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
34954 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
34955 (IdxVal != 0 || !Vec.isUndef())) {
34956 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
34957 if (ExtIdxVal != 0) {
34958 int VecNumElts = OpVT.getVectorNumElements();
34959 int SubVecNumElts = SubVecVT.getVectorNumElements();
34960 SmallVector<int, 64> Mask(VecNumElts);
34961 // First create an identity shuffle mask.
34962 for (int i = 0; i != VecNumElts; ++i)
34964 // Now insert the extracted portion.
34965 for (int i = 0; i != SubVecNumElts; ++i)
34966 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
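// E.g. inserting the upper half of a v8f32 source at element 4 of an
// 8-element destination yields the mask <0,1,2,3,12,13,14,15>, where indices
// >= 8 select from the extract's source vector.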
34968 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
// load:
34974 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34975 // (load16 addr + 16), Elts/2)
34978 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34979 // (load32 addr + 32), Elts/2)
34981 // or a 16-byte or 32-byte broadcast:
34982 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
34983 // (load16 addr), Elts/2)
34984 // --> X86SubVBroadcast(load16 addr)
34986 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
34987 // (load32 addr), Elts/2)
34988 // --> X86SubVBroadcast(load32 addr)
34989 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
34990 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
34991 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
34992 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
34993 if (Idx2 && Idx2->getZExtValue() == 0) {
34994 SDValue SubVec2 = Vec.getOperand(1);
34995 // If needed, look through bitcasts to get to the load.
34996 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
34998 unsigned Alignment = FirstLd->getAlignment();
34999 unsigned AS = FirstLd->getAddressSpace();
35000 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35001 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35002 OpVT, AS, Alignment, &Fast) && Fast) {
35003 SDValue Ops[] = {SubVec2, SubVec};
35004 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
35008 // If lower/upper loads are the same and the only users of the load, then
35009 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35010 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35011 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35012 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35013 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
// If this is a subv_broadcast inserted into both halves, use a larger
// subv_broadcast.
35018 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35019 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35020 SubVec.getOperand(0));
35029 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35030 DAGCombinerInfo &DCI) const {
35031 SelectionDAG &DAG = DCI.DAG;
35032 switch (N->getOpcode()) {
35034 case ISD::EXTRACT_VECTOR_ELT:
35035 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35036 case X86ISD::PEXTRW:
35037 case X86ISD::PEXTRB:
35038 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35039 case ISD::INSERT_SUBVECTOR:
35040 return combineInsertSubvector(N, DAG, DCI, Subtarget);
35043 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35044 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
35045 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
35046 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
35047 case ISD::SUB: return combineSub(N, DAG, Subtarget);
35048 case X86ISD::ADC: return combineADC(N, DAG, DCI);
35049 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35052 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35053 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35054 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35055 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35056 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35057 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35058 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35059 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35060 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35061 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35063 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35064 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35065 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35066 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35067 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35068 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35070 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35072 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35074 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35075 case X86ISD::BT: return combineBT(N, DAG, DCI);
35076 case ISD::ANY_EXTEND:
35077 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35078 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35079 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35080 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35081 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35082 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35083 case X86ISD::VSHLI:
35084 case X86ISD::VSRAI:
35085 case X86ISD::VSRLI:
35086 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35087 case ISD::SIGN_EXTEND_VECTOR_INREG:
35088 case ISD::ZERO_EXTEND_VECTOR_INREG:
35089 case X86ISD::VSEXT:
35090 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35091 case X86ISD::PINSRB:
35092 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35093 case X86ISD::SHUFP: // Handle all target specific shuffles
35094 case X86ISD::INSERTPS:
35095 case X86ISD::PALIGNR:
35096 case X86ISD::VSHLDQ:
35097 case X86ISD::VSRLDQ:
35098 case X86ISD::BLENDI:
35099 case X86ISD::UNPCKH:
35100 case X86ISD::UNPCKL:
35101 case X86ISD::MOVHLPS:
35102 case X86ISD::MOVLHPS:
35103 case X86ISD::PSHUFB:
35104 case X86ISD::PSHUFD:
35105 case X86ISD::PSHUFHW:
35106 case X86ISD::PSHUFLW:
35107 case X86ISD::MOVSHDUP:
35108 case X86ISD::MOVSLDUP:
35109 case X86ISD::MOVDDUP:
35110 case X86ISD::MOVSS:
35111 case X86ISD::MOVSD:
35112 case X86ISD::VPPERM:
35113 case X86ISD::VPERMI:
35114 case X86ISD::VPERMV:
35115 case X86ISD::VPERMV3:
35116 case X86ISD::VPERMIV3:
35117 case X86ISD::VPERMIL2:
35118 case X86ISD::VPERMILPI:
35119 case X86ISD::VPERMILPV:
35120 case X86ISD::VPERM2X128:
35121 case X86ISD::VZEXT_MOVL:
35122 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35123 case X86ISD::FMADD:
35124 case X86ISD::FMADD_RND:
35125 case X86ISD::FMADDS1_RND:
35126 case X86ISD::FMADDS3_RND:
35127 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35129 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35130 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35131 case X86ISD::TESTM: return combineTestM(N, DAG);
35132 case X86ISD::PCMPEQ:
35133 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35139 /// Return true if the target has native support for the specified value type
35140 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35141 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35142 /// some i16 instructions are slow.
35143 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35144 if (!isTypeLegal(VT))
35146 if (VT != MVT::i16)
35153 case ISD::SIGN_EXTEND:
35154 case ISD::ZERO_EXTEND:
35155 case ISD::ANY_EXTEND:
35168 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35169 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35170 /// we don't adjust the stack we clobber the first frame index.
35171 /// See X86InstrInfo::copyPhysReg.
35172 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
35173 MachineFunction *MF) const {
35174 const MachineRegisterInfo &MRI = MF->getRegInfo();
35176 return any_of(MRI.reg_instructions(X86::EFLAGS),
35177 [](const MachineInstr &RI) { return RI.isCopy(); });
/// This method queries the target about whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
35183 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35184 EVT VT = Op.getValueType();
35185 if (VT != MVT::i16)
35188 bool Promote = false;
35189 bool Commute = false;
35190 switch (Op.getOpcode()) {
35192 case ISD::SIGN_EXTEND:
35193 case ISD::ZERO_EXTEND:
35194 case ISD::ANY_EXTEND:
35199 SDValue N0 = Op.getOperand(0);
35200 // Look out for (store (shl (load), x)).
35201 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35214 SDValue N0 = Op.getOperand(0);
35215 SDValue N1 = Op.getOperand(1);
35216 if (!Commute && MayFoldLoad(N1))
35218 // Avoid disabling potential load folding opportunities.
35219 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35221 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35231 //===----------------------------------------------------------------------===//
35232 // X86 Inline Assembly Support
35233 //===----------------------------------------------------------------------===//
// Helper to match an asm string against a sequence of pieces, allowing any
// amount of whitespace between the pieces.
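// E.g. matchAsm("  bswap   $0 ", {"bswap", "$0"}) returns true, while
// matchAsm("bswapl $0", {"bswap", "$0"}) returns false because each piece
// must be followed by whitespace or the end of the string.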
35236 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35237 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35239 for (StringRef Piece : Pieces) {
35240 if (!S.startswith(Piece)) // Check if the piece matches.
35243 S = S.substr(Piece.size());
35244 StringRef::size_type Pos = S.find_first_not_of(" \t");
35245 if (Pos == 0) // We matched a prefix.
35254 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35256 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35257 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35258 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35259 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35261 if (AsmPieces.size() == 3)
35263 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35270 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35271 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35273 const std::string &AsmStr = IA->getAsmString();
35275 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35276 if (!Ty || Ty->getBitWidth() % 16 != 0)
35279 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35280 SmallVector<StringRef, 4> AsmPieces;
35281 SplitString(AsmStr, AsmPieces, ";\n");
35283 switch (AsmPieces.size()) {
35284 default: return false;
35286 // FIXME: this should verify that we are targeting a 486 or better. If not,
35287 // we will turn this bswap into something that will be lowered to logical
35288 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35289 // lower so don't worry about this.
35291 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35292 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35293 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35294 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35295 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35296 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35297 // No need to check constraints, nothing other than the equivalent of
35298 // "=r,0" would be valid here.
35299 return IntrinsicLowering::LowerToByteSwap(CI);
35302 // rorw $$8, ${0:w} --> llvm.bswap.i16
35303 if (CI->getType()->isIntegerTy(16) &&
35304 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35305 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35306 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35308 StringRef ConstraintsStr = IA->getConstraintString();
35309 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35310 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35311 if (clobbersFlagRegisters(AsmPieces))
35312 return IntrinsicLowering::LowerToByteSwap(CI);
35316 if (CI->getType()->isIntegerTy(32) &&
35317 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35318 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
35319 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
35320 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
35322 StringRef ConstraintsStr = IA->getConstraintString();
35323 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35324 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35325 if (clobbersFlagRegisters(AsmPieces))
35326 return IntrinsicLowering::LowerToByteSwap(CI);
35329 if (CI->getType()->isIntegerTy(64)) {
35330 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
35331 if (Constraints.size() >= 2 &&
35332 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
35333 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
35334 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
35335 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
35336 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
35337 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
35338 return IntrinsicLowering::LowerToByteSwap(CI);
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R': case 'q': case 'Q': case 'f': case 't': case 'u':
    case 'y': case 'x': case 'v': case 'Y': case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A':
      return C_Register;
    case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
    case 'G': case 'C': case 'e': case 'Z':
      return C_Other;
    default: break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default: break;
    case 'Y':
      switch (Constraint[1]) {
      default: break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

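// For illustration: in an asm like
//   __asm__("outb %0, %1" : : "a"(val), "N"(0x80));
// 'a' is classified as C_Register (it names AL/AX/EAX/RAX) and 'N' as C_Other
// (an immediate in [0,255]); plain 'r' is not special-cased here and falls
// through to the default TargetLowering handling, which treats it as
// C_RegisterClass.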
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R': case 'q': case 'Q': case 'a': case 'b':
  case 'c': case 'd': case 'S': case 'D': case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f': case 't': case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

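// For illustration: with a multi-alternative operand such as
//   __asm__("addl %1, %0" : "+r"(x) : "Ir"(y));
// the generic constraint selection queries a weight per letter; a constant y
// that fits in 5 bits scores CW_Constant for 'I' but only CW_Register for 'r',
// so, roughly speaking, the immediate alternative is preferred.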
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

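// For illustration: an "=X" output holding a double is rewritten to "Y" (an
// SSE2 register) when SSE2 is available and to "x" with only SSE1; anything
// else, including the no-SSE case, is left to the default TargetLowering
// handling.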
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

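// For illustration: in non-PIC code an "i" operand written as something like
//   "i"(&SomeGlobal + 1)   // SomeGlobal: a hypothetical extern global
// reaches the 'i' case above as (GlobalAddress + constant) and is folded into
// a single TargetGlobalAddress immediate; under PIC the same address would
// require a runtime computation (e.g. a GOT load), so it is rejected instead.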
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}

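// For illustration: hasSuperClassEq also matches register classes derived from
// the ones listed, so a constraint that resolved to, say, GR32_ABCD still
// satisfies isGRClass, and FR32X or VR256X satisfy isFRClass.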
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8: case MVT::v8i16: case MVT::v4i32:
      case MVT::v2i64: case MVT::v4f32: case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8: case MVT::v16i16: case MVT::v8i32:
      case MVT::v4i64: case MVT::v8f32: case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64: case MVT::v16f32:
      case MVT::v16i32: case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:  return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:  return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}

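// For illustration: an explicit-register constraint "{ax}" on an i32 operand
// is first resolved generically to AX, and the fix-up above remaps it to EAX
// in GR32; likewise "{xmm0}" on a v4f32 operand is steered into VR128 rather
// than whichever class containing XMM0 the generic lookup happened to find
// first.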
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

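// For illustration: for a legal mode such as (%rsi) the reported cost is 0,
// for (%rsi,%rdx,4) it is 1 (the extra register allocation), and an
// unencodable mode (say, a scale of 6) yields -1, which callers such as
// LoopStrengthReduce, via TargetTransformInfo, treat as unsupported rather
// than merely expensive.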
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}

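// For illustration: in a function marked minsize, a scalar division by a
// constant such as 'x / 10' is kept as a single div instruction, whereas the
// default lowering would expand it into a multiply-by-magic-constant plus
// shifts; vector divides are still expanded because they would otherwise be
// scalarized.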
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

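// For illustration: getCalleeSavedRegsViaCopy returns a register list only for
// functions using the CXX_FAST_TLS calling convention with split CSR enabled
// (typically the C++ TLS access functions), so the copies above are emitted
// for those functions and this hook is a no-op everywhere else.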
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}