//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
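  // For instance, CMPPS/PCMPEQD write 0xFFFFFFFF into every lane whose
  // comparison is true and 0 into every lane that is false, which is why
  // vector booleans above are declared ZeroOrNegativeOneBooleanContent.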

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
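  // The bypass works by emitting a run-time width check: e.g. with
  // addBypassSlowDiv(32, 8), a 32-bit division whose operands both fit in
  // 8 bits is dispatched to the much faster 8-bit divide at run time.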

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
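  // UCOMISS/UCOMISD signal "unordered" with ZF = PF = CF = 1, so an ordered
  // equal test needs ZF == 1 *and* PF == 0, and an unordered not-equal test
  // needs the complementary pair; either way it is two flag checks.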

  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }
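  // The Promote actions above mean, e.g., that a u16 -> f32 conversion is
  // rewritten as a zero-extend of the operand to i32 followed by an ordinary
  // SINT_TO_FP, which is safe because the widened value can never look
  // negative.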

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV,  VT, Expand);
    setOperationAction(ISD::UDIV,  VT, Expand);
    setOperationAction(ISD::SREM,  VT, Expand);
    setOperationAction(ISD::UREM,  VT, Expand);
  }
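  // For example, the 32-bit DIV/IDIV instructions produce the quotient in
  // EAX and the remainder in EDX in one shot, so "x / y" and "x % y" can be
  // CSE'd into a single divide via the two-result SDIVREM/UDIVREM nodes.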

  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC,     VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,  Expand);
  setOperationAction(ISD::FP_ROUND_INREG,    MVT::f32, Expand);

  setOperationAction(ISD::FREM,        MVT::f32, Expand);
  setOperationAction(ISD::FREM,        MVT::f64, Expand);
  setOperationAction(ISD::FREM,        MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ,            MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ,            MVT::i16, Custom);
    setOperationAction(ISD::CTTZ,            MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ,            MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
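  // Note that BSF and TZCNT have no 8-bit register forms, so the i8 -> i32
  // promotion above is required for correctness, not just encoding size;
  // without BMI, CTTZ must also be custom to guard against BSF's undefined
  // result for a zero input.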

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ,            MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ,            MVT::i8,  Custom);
    setOperationAction(ISD::CTLZ,            MVT::i16, Custom);
    setOperationAction(ISD::CTLZ,            MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,  Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ,            MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC,  VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT,    MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP,         MVT::i32,   Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP,        MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool,     VT, Custom);
    setOperationAction(ISD::JumpTable,        VT, Custom);
    setOperationAction(ISD::GlobalAddress,    VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol,   VT, Custom);
    setOperationAction(ISD::BlockAddress,     VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
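  // On 32-bit targets an i64 shift becomes one of the *_PARTS nodes above,
  // lowered as SHLD/SHRD on the two 32-bit halves plus a fix-up for shift
  // amounts of 32 or more.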

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR,  VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE,    VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE,   MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP,      MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE,    MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END,   MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN,    VT, Expand);
      setOperationAction(ISD::FCOS,    VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN,    MVT::f32, Expand);
    setOperationAction(ISD::FCOS,    MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN,    MVT::f64, Expand);
    setOperationAction(ISD::FCOS,    MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF,     VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN,    VT, Expand);
      setOperationAction(ISD::FCOS,    VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::VR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS,      MVT::f128, Custom);
      setOperationAction(ISD::FNEG,      MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN,    MVT::f80, Expand);
    setOperationAction(ISD::FCOS,    MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
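    // x87 FSIN/FCOS only accept arguments with magnitude below 2^63 and are
    // known to lose precision well before that, so calling libm is both
    // safer and usually more accurate.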

    setOperationAction(ISD::FFLOOR,     MVT::f80, Expand);
    setOperationAction(ISD::FCEIL,      MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC,     MVT::f80, Expand);
    setOperationAction(ISD::FRINT,      MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA,        MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG,    MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG10,  MVT::f80, Expand);
  setOperationAction(ISD::FEXP,    MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,   MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN,      VT, Expand);
    setOperationAction(ISD::FSINCOS,   VT, Expand);
    setOperationAction(ISD::FCOS,      VT, Expand);
    setOperationAction(ISD::FREM,      VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW,      VT, Expand);
    setOperationAction(ISD::FLOG,      VT, Expand);
    setOperationAction(ISD::FLOG2,     VT, Expand);
    setOperationAction(ISD::FLOG10,    VT, Expand);
    setOperationAction(ISD::FEXP,      VT, Expand);
    setOperationAction(ISD::FEXP2,     VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,  VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Expand);
    setOperationAction(ISD::FMA,        VT, Expand);
    setOperationAction(ISD::FFLOOR,     VT, Expand);
    setOperationAction(ISD::FCEIL,      VT, Expand);
    setOperationAction(ISD::FTRUNC,     VT, Expand);
    setOperationAction(ISD::FRINT,      VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI,  VT, Expand);
    setOperationAction(ISD::MULHS,      VT, Expand);
    setOperationAction(ISD::UMUL_LOHI,  VT, Expand);
    setOperationAction(ISD::MULHU,      VT, Expand);
    setOperationAction(ISD::SDIVREM,    VT, Expand);
    setOperationAction(ISD::UDIVREM,    VT, Expand);
    setOperationAction(ISD::CTPOP,      VT, Expand);
    setOperationAction(ISD::CTTZ,       VT, Expand);
    setOperationAction(ISD::CTLZ,       VT, Expand);
    setOperationAction(ISD::ROTL,       VT, Expand);
    setOperationAction(ISD::ROTR,       VT, Expand);
    setOperationAction(ISD::BSWAP,      VT, Expand);
    setOperationAction(ISD::SETCC,      VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE,    VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND,  VT, Expand);
    setOperationAction(ISD::SELECT_CC,   VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL,       MVT::v16i8, Custom);
    setOperationAction(ISD::MUL,       MVT::v4i32, Custom);
    setOperationAction(ISD::MUL,       MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU,     MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS,     MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU,     MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS,     MVT::v8i16, Legal);
    setOperationAction(ISD::MUL,       MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG,      MVT::v2f64, Custom);
    setOperationAction(ISD::FABS,      MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Provide custom widening for v2f32 setcc. This is really for VLX when
    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    // type legalization changing the result type to v4i1 during widening.
    // It works fine for SSE2 and is probably faster so no need to qualify with
    // VLX support.
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ,  VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i8,  Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD,  VT, MVT::v8i8,  Custom);
    }
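    // For example, a sextload of v4i8 can be done as one 32-bit scalar load
    // followed by an in-register sign extension (PMOVSXBD on SSE4.1, or a
    // shuffle plus arithmetic shifts on plain SSE2) instead of four loads.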

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR,   VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT,        VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND,  MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8,  Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
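    // Only AVX2 provides per-element variable shifts (e.g. VPSLLVD for
    // v4i32); on plain SSE2 the custom lowering synthesizes them from
    // scalar shifts, shuffles or multiplies.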

    setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS,        MVT::v16i8, Legal);
    setOperationAction(ISD::ABS,        MVT::v8i16, Legal);
    setOperationAction(ISD::ABS,        MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ,       MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR,     RoundedTy, Legal);
      setOperationAction(ISD::FCEIL,      RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC,     RoundedTy, Legal);
      setOperationAction(ISD::FRINT,      RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8,  Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR,     VT, Legal);
      setOperationAction(ISD::FCEIL,      VT, Legal);
      setOperationAction(ISD::FTRUNC,     VT, Legal);
      setOperationAction(ISD::FRINT,      VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG,       VT, Custom);
      setOperationAction(ISD::FABS,       VT, Custom);
      setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND,   MVT::v4f32, Legal);

    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v8i32,  Custom);
    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v32i8,  Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE,   MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE,   MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE,   MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ,  VT, Custom);
      setOperationAction(ISD::CTLZ,  VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64,  Custom);
    setOperationAction(ISD::MUL, MVT::v8i32,  HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8,  Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8,  Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8,  Custom);

    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD,  VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX2()) {
    // Custom legalize 2x32 to get a little better code.
    setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
    setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MGATHER, VT, Custom);
  }

  // This block controls legalization of the mask vector sizes that are
  // available with AVX512. 512-bit vectors are in a separate block controlled
  // by useAVX512Regs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v1i1,  &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1,  &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1,  &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1,  &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT,             MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i1, Custom);
    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i1, Custom);

    // There is no byte sized k-register load or store without AVX512DQ.
    if (!Subtarget.hasDQI()) {
      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
    }
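    // KMOVB is an AVX512DQ instruction, so without DQI the byte-sized mask
    // loads/stores above are custom-lowered through general purpose
    // registers instead.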

    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND,  VT, Custom);
    }

    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD,      VT, Custom);
      setOperationAction(ISD::SUB,      VT, Custom);
      setOperationAction(ISD::MUL,      VT, Custom);
      setOperationAction(ISD::SETCC,    VT, Custom);
      setOperationAction(ISD::SELECT,   VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::VSELECT,            VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS,   MVT::v16i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS,   MVT::v8i1,  Custom);
    setOperationAction(ISD::CONCAT_VECTORS,   MVT::v4i1,  Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1,  Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1,  Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1,  Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }

  // This block controls legalization for 512-bit operations with 32/64 bit
  // elements. 512-bits can be disabled based on prefer-vector-width and
  // required-vector-width function attributes.
  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG,      VT, Custom);
      setOperationAction(ISD::FABS,      VT, Custom);
      setOperationAction(ISD::FMA,       VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8,  MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1,  MVT::v16i32);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1,  MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8,  MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64,  MVT::v8i8,   Legal);
    setTruncStoreAction(MVT::v8i64,  MVT::v8i16,  Legal);
    setTruncStoreAction(MVT::v8i64,  MVT::v8i32,  Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8,  Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

    if (!Subtarget.hasVLX()) {
      // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
      // to 512-bit rather than use the AVX2 instructions so that we can use
      // k-registers for masking.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD,  VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE,    MVT::v8i32,  Custom);
    setOperationAction(ISD::TRUNCATE,    MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND,  MVT::v8i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR,     VT, Legal);
      setOperationAction(ISD::FCEIL,      VT, Legal);
      setOperationAction(ISD::FTRUNC,     VT, Legal);
      setOperationAction(ISD::FRINT,      VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64,  Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64,  Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64,  Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::SELECT, MVT::v8f64,  Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64,  Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1340 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1341 setOperationAction(ISD::SMAX, VT, Legal);
1342 setOperationAction(ISD::UMAX, VT, Legal);
1343 setOperationAction(ISD::SMIN, VT, Legal);
1344 setOperationAction(ISD::UMIN, VT, Legal);
1345 setOperationAction(ISD::ABS, VT, Legal);
1346 setOperationAction(ISD::SRL, VT, Custom);
1347 setOperationAction(ISD::SHL, VT, Custom);
1348 setOperationAction(ISD::SRA, VT, Custom);
1349 setOperationAction(ISD::CTPOP, VT, Custom);
1350 setOperationAction(ISD::CTTZ, VT, Custom);
1351 setOperationAction(ISD::ROTL, VT, Custom);
1352 setOperationAction(ISD::ROTR, VT, Custom);
1353 setOperationAction(ISD::SETCC, VT, Custom);
1355 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1356 // setcc all the way to isel and prefer SETGT in some isel patterns.
1357 setCondCodeAction(ISD::SETLT, VT, Custom);
1358 setCondCodeAction(ISD::SETLE, VT, Custom);
1361 // Need to promote to 64-bit even though we have 32-bit masked instructions
1362 // because the IR optimizers rearrange bitcasts around logic ops leaving
1363 // too many variations to handle if we don't promote them.
1364 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1365 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1366 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
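    // Sketch of what this promotion means (illustrative): a v16i32 AND is
    // rewritten in terms of the 64-bit element type, roughly
    //   (v16i32 (and A, B))
    //     -> (bitcast (and (bitcast A to v8i64), (bitcast B to v8i64)))
    // which is safe because bitwise logic is insensitive to element size.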
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);

      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }

    if (Subtarget.hasCDI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v16i32, MVT::v8i64 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 256-bit but the source is 512-bit wide.
    // 128-bit was made Legal under AVX1.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }

    // Need to custom split v32i16/v64i8 bitcasts.
    if (!Subtarget.hasBWI()) {
      setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
      setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
    }
  }
  // This block controls legalization for operations that don't have
  // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
  // smaller types.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    // These operations are handled on non-VLX by artificially widening in
    // isel patterns.
    // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::ABS, VT, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
    }

    // Custom legalize 2x32 to get a little better code.
    setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
    setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MSCATTER, VT, Custom);

    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);

        setOperationAction(ISD::MUL, VT, Legal);
      }
    }

    if (Subtarget.hasCDI()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasVPOPCNTDQ()) {
      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }
  // This block controls legalization of v32i1/v64i1 which are available with
  // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
  // useBWIRegs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

    for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
    for (auto VT : { MVT::v16i1, MVT::v32i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

    // Extends from v32i1 masks to 256-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
  }
  // This block controls legalization for v32i16 and v64i8. 512-bits can be
  // disabled based on prefer-vector-width and required-vector-width function
  // attributes.
  if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

    // Extends from v64i1 masks to 512-bit vectors.
    setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

    setOperationAction(ISD::MUL, MVT::v32i16, Legal);
    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

    setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::SETCC, VT, Custom);

      setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
    }

    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v64i8, MVT::v32i16 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
      setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
      setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
    }

    // These operations are handled on non-VLX by artificially widening in
    // isel patterns.
    // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
    setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
    setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

    setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
    setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
    setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
    setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
    setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    if (Subtarget.hasDQI()) {
      // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
      // v2f32 UINT_TO_FP is already custom under SSE2.
      setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
             "Unexpected operation action!");
      // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
      setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
    }

    if (Subtarget.hasBWI()) {
      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
    }
  }
  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  if (!Subtarget.is64Bit()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub/Mul with overflow operations are custom lowered.
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);

    // Support carry in as value rather than glue.
    setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
    setOperationAction(ISD::SETCCCARRY, VT, Custom);
  }
  if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
    setLibcallName(RTLIB::MUL_I128, nullptr);
  }

  // Combine sin / cos into _sincos_stret if it is available.
  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }

  if (Subtarget.isTargetWin64()) {
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
    setOperationAction(ISD::SREM, MVT::i128, Custom);
    setOperationAction(ISD::UREM, MVT::i128, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }

  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
  // is. We should promote the value to 64-bits to solve this.
  // This is what the CRT headers do - `fmodf` is an inline header
  // function casting to f64 and calling `fmod`.
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                              Subtarget.isTargetWindowsItanium()))
    for (ISD::NodeType Op :
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
      if (isOperationExpand(Op, MVT::f32))
        setOperationAction(Op, MVT::f32, Promote);
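  // Illustrative effect of the promotion above: an f32 FREM such as
  // fmodf(x, y) is lowered roughly as (float)fmod((double)x, (double)y),
  // mirroring the CRT's inline fmodf wrapper.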
  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MLOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MSTORE);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::MSCATTER);
  setTargetDAGCombine(ISD::MGATHER);

  computeRegisterProperties(Subtarget.getRegisterInfo());
  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 4;
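  // Example of what these thresholds mean (illustrative): with
  // MaxStoresPerMemcpy == 8, a small constant-length @llvm.memcpy that can be
  // covered by at most 8 stores is expanded inline into a store sequence;
  // anything larger falls back to the memcpy libcall.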
  // TODO: These control memcmp expansion in CGP and could be raised higher, but
  // that needs to be benchmarked and balanced with the potential use of vector
  // load/store types (PR33329, PR33914).
  MaxLoadsPerMemcmp = 2;
  MaxLoadsPerMemcmpOptSize = 2;

  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
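  // E.g. passing -x86-experimental-pref-loop-alignment=5 requests
  // 2^5 = 32 byte loop header alignment (illustrative use of the flag).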
  // An out-of-order CPU can speculatively execute past a predictable branch,
  // but a conditional move could be stalled by an expensive earlier operation.
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
  EnableExtLdPromotion = true;
  setPrefFunctionAlignment(4); // 2^4 bytes.

  verifyIntrinsicTables();
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}

bool X86TargetLowering::useStackGuardXorFP() const {
  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
  return Subtarget.getTargetTriple().isOSMSVCRT();
}

SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  EVT PtrTy = getPointerTy(DAG.getDataLayout());
  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
  return SDValue(Node, 0);
}
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return TypeSplitVector;

  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return MVT::v32i8;
  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}

unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
  if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
    return 1;
  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    const unsigned NumElts = VT.getVectorNumElements();

    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, NumElts);

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
    }
  }

  return VT.changeVectorElementTypeToInteger();
}
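// Illustrative examples (assumptions, not exhaustive): with AVX-512, a setcc
// on v16f32 legalizes to a 512-bit compare and so yields v16i1 here; on a
// subtarget without AVX-512 the fallback produces v16i32 via
// changeVectorElementTypeToInteger().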
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}
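// For instance (illustrative): given
//   struct S { __m128 V; int I; };
// the recursion above finds the 128-bit vector member and raises MaxAlign
// to 16.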
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
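// Worked example (illustrative): a 64-byte memcpy on an AVX subtarget with
// fast unaligned 16-byte accesses returns MVT::v32i8, so the expansion uses
// two 32-byte vector load/store pairs instead of a libcall.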
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isIntOrPtrTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
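// Sketch of one resulting table entry in the emitted assembly (illustrative;
// the exact label depends on the function):
//   .long .LBB0_2@GOTOFF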
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
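// Illustrative result (assuming a 64-bit target where getAddressSpace()
// returns 257, i.e. the %fs segment): SegmentOffset(IRB, 0x28, 257) yields
// the IR constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// which later lowering turns into an %fs-relative memory access.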
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    } else {
      // %fs:0x28, unless we're using a Kernel code model, in which case
      // it's %gs:0x28. gs:0x14 on i386.
      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
      return SegmentOffset(IRB, Offset, getAddressSpace());
    }
  }

  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getGlobalVariable("__security_cookie");
  }
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
      Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
    return M.getFunction("__security_check_cookie");
  }
  return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case
    // it's %gs:0x48; 0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"
bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16  -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  }

  if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
      (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }
  return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
}
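// Example of the two stage path above (illustrative): a v16i1 mask headed
// for an i32 location becomes
//   (i32 (any_extend (i16 (bitcast v16i1 X))))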
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                   MVT::i32));

  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");
    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }
    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }
  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    //
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using
///                        a physical register instead of a virtual one, we
///                        should glue our new SDValue to InFlag.
/// \return a new SDValue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  unsigned Reg;
  SDValue ArgValueLo, ArgValueHi;
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine, this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64 bit machine, there is no need to truncate the value, only bitcast.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}
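// This is the inverse of lowerMasksToReg. Illustrative example: a v8i1 value
// arriving in an i32 location is recovered as
//   (v8i1 (bitcast (i8 (truncate i32 X))))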
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}
//===----------------------------------------------------------------------===//
//               C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be the standard for many Windows
//  API routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should be also
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see Fast Calling Convention (tail
//  call) implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}
2815 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2816 const SmallVectorImpl<ISD::InputArg> &Ins,
2817 const SDLoc &dl, SelectionDAG &DAG,
2818 const CCValAssign &VA,
2819 MachineFrameInfo &MFI, unsigned i) const {
2820 // Create the nodes corresponding to a load from this parameter slot.
2821 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2822 bool AlwaysUseMutable = shouldGuaranteeTCO(
2823 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2824 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2825 EVT ValVT;
2826 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2828 // If value is passed by pointer we have address passed instead of the value
2829 // itself. No need to extend if the mask value and location share the same
2830 // ABI size (maybe not optimal).
2831 bool ExtendedInMem =
2832 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2833 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2835 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2836 ValVT = VA.getLocVT();
2837 else
2838 ValVT = VA.getValVT();
2840 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2841 // taken by a return address.
2842 int Offset = 0;
2843 if (CallConv == CallingConv::X86_INTR) {
2844 // X86 interrupts may take one or two arguments.
2845 // On the stack there will be no return address as in a regular call.
2846 // The offset of the last argument needs to be set to -4/-8 bytes.
2847 // The offset of the first argument (out of two) should be set to 0 bytes.
2848 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2849 if (Subtarget.is64Bit() && Ins.size() == 2) {
2850 // The stack pointer needs to be realigned for 64-bit handlers with an
2851 // error code, so the argument offset changes by 8 bytes.
2852 Offset += 8;
2853 }
2854 }
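// Worked example of the arithmetic above (illustrative): a 64-bit handler
// with an error code has Ins.size() == 2, so
//   i == 0: Offset = 8 * ((0 + 1) % 2 - 1) + 8 = 8   (interrupt frame)
//   i == 1: Offset = 8 * ((1 + 1) % 2 - 1) + 8 = 0   (error code at SP)
// while a handler without an error code (Ins.size() == 1) gets
//   i == 0: Offset = 8 * ((0 + 1) % 1 - 1) = -8.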
2856 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2857 // changed with more analysis.
2858 // In case of tail call optimization, mark all arguments mutable, since they
2859 // could be overwritten by the lowering of the arguments of a tail call.
2860 if (Flags.isByVal()) {
2861 unsigned Bytes = Flags.getByValSize();
2862 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2864 // FIXME: For now, all byval parameter objects are marked as aliasing. This
2865 // can be improved with deeper analysis.
2866 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
2867 /*isAliased=*/true);
2868 // Adjust SP offset of interrupt parameter.
2869 if (CallConv == CallingConv::X86_INTR) {
2870 MFI.setObjectOffset(FI, Offset);
2871 }
2872 return DAG.getFrameIndex(FI, PtrVT);
2873 }
2875 // This is an argument in memory. We might be able to perform copy elision.
2876 if (Flags.isCopyElisionCandidate()) {
2877 EVT ArgVT = Ins[i].ArgVT;
2878 SDValue PartAddr;
2879 if (Ins[i].PartOffset == 0) {
2880 // If this is a one-part value or the first part of a multi-part value,
2881 // create a stack object for the entire argument value type and return a
2882 // load from our portion of it. This assumes that if the first part of an
2883 // argument is in memory, the rest will also be in memory.
2884 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2885 /*Immutable=*/false);
2886 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2887 return DAG.getLoad(
2888 ValVT, dl, Chain, PartAddr,
2889 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2890 } else {
2891 // This is not the first piece of an argument in memory. See if there is
2892 // already a fixed stack object including this offset. If so, assume it
2893 // was created by the PartOffset == 0 branch above and create a load from
2894 // the appropriate offset into it.
2895 int64_t PartBegin = VA.getLocMemOffset();
2896 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2897 int FI = MFI.getObjectIndexBegin();
2898 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2899 int64_t ObjBegin = MFI.getObjectOffset(FI);
2900 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2901 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2902 break;
2903 }
2904 if (MFI.isFixedObjectIndex(FI)) {
2905 SDValue Addr =
2906 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2907 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2908 return DAG.getLoad(
2909 ValVT, dl, Chain, Addr,
2910 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2911 Ins[i].PartOffset));
2912 }
2913 }
2914 }
2916 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2917 VA.getLocMemOffset(), isImmutable);
2919 // Set SExt or ZExt flag.
2920 if (VA.getLocInfo() == CCValAssign::ZExt) {
2921 MFI.setObjectZExt(FI, true);
2922 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2923 MFI.setObjectSExt(FI, true);
2924 }
2926 // Adjust SP offset of interrupt parameter.
2927 if (CallConv == CallingConv::X86_INTR) {
2928 MFI.setObjectOffset(FI, Offset);
2929 }
2931 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2932 SDValue Val = DAG.getLoad(
2933 ValVT, dl, Chain, FIN,
2934 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2935 return ExtendedInMem
2936 ? (VA.getValVT().isVector()
2937 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2938 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2939 : Val;
2940 }
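// Example of the ExtendedInMem path above (illustrative): an i1 value that
// the convention extended into a wider stack slot is loaded at the location
// type and TRUNCATE'd back to i1 here; only v*i1 mask values take the
// SCALAR_TO_VECTOR route instead.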
2942 // FIXME: Get this from tablegen.
2943 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2944 const X86Subtarget &Subtarget) {
2945 assert(Subtarget.is64Bit());
2947 if (Subtarget.isCallingConvWin64(CallConv)) {
2948 static const MCPhysReg GPR64ArgRegsWin64[] = {
2949 X86::RCX, X86::RDX, X86::R8, X86::R9
2951 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2952 }
2954 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2955 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2957 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2958 }
2960 // FIXME: Get this from tablegen.
2961 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2962 CallingConv::ID CallConv,
2963 const X86Subtarget &Subtarget) {
2964 assert(Subtarget.is64Bit());
2965 if (Subtarget.isCallingConvWin64(CallConv)) {
2966 // The XMM registers which might contain var arg parameters are shadowed
2967 // in their paired GPR. So we only need to save the GPR to their home
2968 // slots.
2969 // TODO: __vectorcall will change this.
2970 return None;
2971 }
2973 const Function &F = MF.getFunction();
2974 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2975 bool isSoftFloat = Subtarget.useSoftFloat();
2976 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2977 "SSE register cannot be used when SSE is disabled!");
2978 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2979 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2980 // registers.
2981 return None;
2983 static const MCPhysReg XMMArgRegs64Bit[] = {
2984 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2985 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2987 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2988 }
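// For illustration (an assumed Win64 vararg call): in f(fmt, 1.0, 2.0) the
// doubles travel in XMM1/XMM2 but are shadowed by RDX/R8, so spilling only
// the GPR home slots is enough for va_arg to recover every vararg value.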
2990 #ifndef NDEBUG
2991 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2992 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2993 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2994 return A.getValNo() < B.getValNo();
2995 });
2996 }
2997 #endif
2999 SDValue X86TargetLowering::LowerFormalArguments(
3000 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3001 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3002 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3003 MachineFunction &MF = DAG.getMachineFunction();
3004 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3005 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3007 const Function &F = MF.getFunction();
3008 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3009 F.getName() == "main")
3010 FuncInfo->setForceFramePointer(true);
3012 MachineFrameInfo &MFI = MF.getFrameInfo();
3013 bool Is64Bit = Subtarget.is64Bit();
3014 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3016 assert(
3017 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3018 "Var args not supported with calling convention regcall, fastcc, ghc or hipe");
3020 if (CallConv == CallingConv::X86_INTR) {
3021 bool isLegal = Ins.size() == 1 ||
3022 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3023 (!Is64Bit && Ins[1].VT == MVT::i32)));
3024 if (!isLegal)
3025 report_fatal_error("X86 interrupts may take one or two arguments");
3026 }
3028 // Assign locations to all of the incoming arguments.
3029 SmallVector<CCValAssign, 16> ArgLocs;
3030 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3032 // Allocate shadow area for Win64.
3033 if (IsWin64)
3034 CCInfo.AllocateStack(32, 8);
3036 CCInfo.AnalyzeArguments(Ins, CC_X86);
3038 // In vectorcall calling convention a second pass is required for the HVA
3039 // registers.
3040 if (CallingConv::X86_VectorCall == CallConv) {
3041 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3042 }
3044 // The next loop assumes that the locations are in the same order of the
3045 // Ins array.
3046 assert(isSortedByValueNo(ArgLocs) &&
3047 "Argument Location list must be sorted before lowering");
3050 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3051 ++I, ++InsIndex) {
3052 assert(InsIndex < Ins.size() && "Invalid Ins index");
3053 CCValAssign &VA = ArgLocs[I];
3054 SDValue ArgValue;
3055 if (VA.isRegLoc()) {
3056 EVT RegVT = VA.getLocVT();
3057 if (VA.needsCustom()) {
3058 assert(
3059 VA.getValVT() == MVT::v64i1 &&
3060 "Currently the only custom case is when we split v64i1 to 2 regs");
3062 // v64i1 values, in the regcall calling convention, that are
3063 // compiled for a 32-bit arch, are split up into two registers.
3064 ArgValue =
3065 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3066 } else {
3067 const TargetRegisterClass *RC;
3068 if (RegVT == MVT::i8)
3069 RC = &X86::GR8RegClass;
3070 else if (RegVT == MVT::i16)
3071 RC = &X86::GR16RegClass;
3072 else if (RegVT == MVT::i32)
3073 RC = &X86::GR32RegClass;
3074 else if (Is64Bit && RegVT == MVT::i64)
3075 RC = &X86::GR64RegClass;
3076 else if (RegVT == MVT::f32)
3077 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3078 else if (RegVT == MVT::f64)
3079 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3080 else if (RegVT == MVT::f80)
3081 RC = &X86::RFP80RegClass;
3082 else if (RegVT == MVT::f128)
3083 RC = &X86::VR128RegClass;
3084 else if (RegVT.is512BitVector())
3085 RC = &X86::VR512RegClass;
3086 else if (RegVT.is256BitVector())
3087 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3088 else if (RegVT.is128BitVector())
3089 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3090 else if (RegVT == MVT::x86mmx)
3091 RC = &X86::VR64RegClass;
3092 else if (RegVT == MVT::v1i1)
3093 RC = &X86::VK1RegClass;
3094 else if (RegVT == MVT::v8i1)
3095 RC = &X86::VK8RegClass;
3096 else if (RegVT == MVT::v16i1)
3097 RC = &X86::VK16RegClass;
3098 else if (RegVT == MVT::v32i1)
3099 RC = &X86::VK32RegClass;
3100 else if (RegVT == MVT::v64i1)
3101 RC = &X86::VK64RegClass;
3102 else
3103 llvm_unreachable("Unknown argument type!");
3105 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3106 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3107 }
3109 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3110 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3111 // right size.
3112 if (VA.getLocInfo() == CCValAssign::SExt)
3113 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3114 DAG.getValueType(VA.getValVT()));
3115 else if (VA.getLocInfo() == CCValAssign::ZExt)
3116 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3117 DAG.getValueType(VA.getValVT()));
3118 else if (VA.getLocInfo() == CCValAssign::BCvt)
3119 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3121 if (VA.isExtInLoc()) {
3122 // Handle MMX values passed in XMM regs.
3123 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3124 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3125 else if (VA.getValVT().isVector() &&
3126 VA.getValVT().getScalarType() == MVT::i1 &&
3127 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3128 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3129 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3130 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3131 } else
3132 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3133 }
3134 } else {
3135 assert(VA.isMemLoc());
3136 ArgValue =
3137 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3138 }
3140 // If value is passed via pointer - do a load.
3141 if (VA.getLocInfo() == CCValAssign::Indirect)
3142 ArgValue =
3143 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3145 InVals.push_back(ArgValue);
3146 }
3148 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3149 // Swift calling convention does not require we copy the sret argument
3150 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3151 if (CallConv == CallingConv::Swift)
3152 continue;
3154 // All x86 ABIs require that for returning structs by value we copy the
3155 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3156 // the argument into a virtual register so that we can access it from the
3158 if (Ins[I].Flags.isSRet()) {
3159 unsigned Reg = FuncInfo->getSRetReturnReg();
3160 if (Reg == 0) {
3161 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3162 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3163 FuncInfo->setSRetReturnReg(Reg);
3164 }
3165 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3166 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3167 break;
3168 }
3169 }
3171 unsigned StackSize = CCInfo.getNextStackOffset();
3172 // Align stack specially for tail calls.
3173 if (shouldGuaranteeTCO(CallConv,
3174 MF.getTarget().Options.GuaranteedTailCallOpt))
3175 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3177 // If the function takes variable number of arguments, make a frame index for
3178 // the start of the first vararg value... for expansion of llvm.va_start. We
3179 // can skip this if there are no va_start calls.
3180 if (MFI.hasVAStart() &&
3181 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3182 CallConv != CallingConv::X86_ThisCall))) {
3183 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3184 }
3186 // Figure out if XMM registers are in use.
3187 assert(!(Subtarget.useSoftFloat() &&
3188 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3189 "SSE register cannot be used when SSE is disabled!");
3191 // 64-bit calling conventions support varargs and register parameters, so we
3192 // have to do extra work to spill them in the prologue.
3193 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3194 // Find the first unallocated argument registers.
3195 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3196 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3197 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3198 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3199 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3200 "SSE register cannot be used when SSE is disabled!");
3202 // Gather all the live in physical registers.
3203 SmallVector<SDValue, 6> LiveGPRs;
3204 SmallVector<SDValue, 8> LiveXMMRegs;
3205 SDValue ALVal;
3206 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3207 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3208 LiveGPRs.push_back(
3209 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3210 }
3211 if (!ArgXMMs.empty()) {
3212 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3213 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3214 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3215 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3216 LiveXMMRegs.push_back(
3217 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3218 }
3219 }
3221 if (IsWin64) {
3222 // Get to the caller-allocated home save location. Add 8 to account
3223 // for the return address.
3224 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3225 FuncInfo->setRegSaveFrameIndex(
3226 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3227 // Fixup to set vararg frame on shadow area (4 x i64).
3228 if (NumIntRegs < 4)
3229 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3230 } else {
3231 // For X86-64, if there are vararg parameters that are passed via
3232 // registers, then we must store them to their spots on the stack so
3233 // they may be loaded by dereferencing the result of va_next.
3234 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3235 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3236 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3237 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3238 }
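// Worked example (SysV x86-64, illustrative): ArgGPRs has 6 entries and
// ArgXMMs has 8, giving a 6*8 + 8*16 = 176 byte register save area. If the
// fixed arguments consumed 2 GPRs and 1 XMM register, VarArgsGPOffset is 16
// and VarArgsFPOffset is 48 + 16 = 64, matching the gp_offset/fp_offset
// fields that va_arg later reads out of the va_list.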
3240 // Store the integer parameter registers.
3241 SmallVector<SDValue, 8> MemOps;
3242 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3243 getPointerTy(DAG.getDataLayout()));
3244 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3245 for (SDValue Val : LiveGPRs) {
3246 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3247 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3248 SDValue Store =
3249 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3250 MachinePointerInfo::getFixedStack(
3251 DAG.getMachineFunction(),
3252 FuncInfo->getRegSaveFrameIndex(), Offset));
3253 MemOps.push_back(Store);
3254 Offset += 8;
3255 }
3257 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3258 // Now store the XMM (fp + vector) parameter registers.
3259 SmallVector<SDValue, 12> SaveXMMOps;
3260 SaveXMMOps.push_back(Chain);
3261 SaveXMMOps.push_back(ALVal);
3262 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3263 FuncInfo->getRegSaveFrameIndex(), dl));
3264 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3265 FuncInfo->getVarArgsFPOffset(), dl));
3266 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3267 LiveXMMRegs.end());
3268 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3269 MVT::Other, SaveXMMOps));
3270 }
3272 if (!MemOps.empty())
3273 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3274 }
3276 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3277 // Find the largest legal vector type.
3278 MVT VecVT = MVT::Other;
3279 // FIXME: Only some x86_32 calling conventions support AVX512.
3280 if (Subtarget.hasAVX512() &&
3281 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3282 CallConv == CallingConv::Intel_OCL_BI)))
3283 VecVT = MVT::v16f32;
3284 else if (Subtarget.hasAVX())
3285 VecVT = MVT::v8f32;
3286 else if (Subtarget.hasSSE2())
3287 VecVT = MVT::v4f32;
3289 // We forward some GPRs and some vector types.
3290 SmallVector<MVT, 2> RegParmTypes;
3291 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3292 RegParmTypes.push_back(IntVT);
3293 if (VecVT != MVT::Other)
3294 RegParmTypes.push_back(VecVT);
3296 // Compute the set of forwarded registers. The rest are scratch.
3297 SmallVectorImpl<ForwardedRegister> &Forwards =
3298 FuncInfo->getForwardedMustTailRegParms();
3299 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3301 // Conservatively forward AL on x86_64, since it might be used for varargs.
3302 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3303 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3304 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3305 }
3307 // Copy all forwards from physical to virtual registers.
3308 for (ForwardedRegister &F : Forwards) {
3309 // FIXME: Can we use a less constrained schedule?
3310 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3311 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3312 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3313 }
3314 }
3316 // Some CCs need callee pop.
3317 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3318 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3319 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3320 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3321 // X86 interrupts must pop the error code (and the alignment padding) if
3322 // present.
3323 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3324 } else {
3325 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3326 // If this is an sret function, the return should pop the hidden pointer.
3327 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3328 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3329 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3330 FuncInfo->setBytesToPopOnReturn(4);
3331 }
3333 if (!Is64Bit) {
3334 // RegSaveFrameIndex is X86-64 only.
3335 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3336 if (CallConv == CallingConv::X86_FastCall ||
3337 CallConv == CallingConv::X86_ThisCall)
3338 // fastcc functions can't have varargs.
3339 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3340 }
3342 FuncInfo->setArgumentStackSize(StackSize);
3344 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3345 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3346 if (Personality == EHPersonality::CoreCLR) {
3347 assert(Is64Bit);
3348 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3349 // that we'd prefer this slot be allocated towards the bottom of the frame
3350 // (i.e. near the stack pointer after allocating the frame). Every
3351 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3352 // offset from the bottom of this and each funclet's frame must be the
3353 // same, so the size of funclets' (mostly empty) frames is dictated by
3354 // how far this slot is from the bottom (since they allocate just enough
3355 // space to accommodate holding this slot at the correct offset).
3356 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3357 EHInfo->PSPSymFrameIdx = PSPSymFI;
3358 }
3359 }
3361 if (CallConv == CallingConv::X86_RegCall ||
3362 F.hasFnAttribute("no_caller_saved_registers")) {
3363 MachineRegisterInfo &MRI = MF.getRegInfo();
3364 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3365 MRI.disableCalleeSavedRegister(Pair.first);
3366 }
3368 return Chain;
3369 }
3371 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3372 SDValue Arg, const SDLoc &dl,
3373 SelectionDAG &DAG,
3374 const CCValAssign &VA,
3375 ISD::ArgFlagsTy Flags) const {
3376 unsigned LocMemOffset = VA.getLocMemOffset();
3377 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3378 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3379 StackPtr, PtrOff);
3380 if (Flags.isByVal())
3381 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3383 return DAG.getStore(
3384 Chain, dl, Arg, PtrOff,
3385 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3386 }
3388 /// Emit a load of return address if tail call
3389 /// optimization is performed and it is required.
3390 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3391 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3392 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3393 // Adjust the Return address stack slot.
3394 EVT VT = getPointerTy(DAG.getDataLayout());
3395 OutRetAddr = getReturnAddressFrameIndex(DAG);
3397 // Load the "old" Return address.
3398 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3399 return SDValue(OutRetAddr.getNode(), 1);
3400 }
3402 /// Emit a store of the return address if tail call
3403 /// optimization is performed and it is required (FPDiff!=0).
3404 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3405 SDValue Chain, SDValue RetAddrFrIdx,
3406 EVT PtrVT, unsigned SlotSize,
3407 int FPDiff, const SDLoc &dl) {
3408 // Store the return address to the appropriate stack slot.
3409 if (!FPDiff) return Chain;
3410 // Calculate the new stack slot for the return address.
3411 int NewReturnAddrFI =
3412 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3413 false);
3414 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3415 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3416 MachinePointerInfo::getFixedStack(
3417 DAG.getMachineFunction(), NewReturnAddrFI));
3418 return Chain;
3419 }
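// Worked example (illustrative, 32-bit, SlotSize == 4): if the caller frame
// holds 8 bytes of incoming arguments but the tail callee needs 16, FPDiff
// is 8 - 16 = -8, so the return address is re-stored into a fixed object at
// offset FPDiff - SlotSize = -12, just below the enlarged argument area.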
3421 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3422 /// operation of specified width.
3423 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3424 SDValue V2) {
3425 unsigned NumElems = VT.getVectorNumElements();
3426 SmallVector<int, 8> Mask;
3427 Mask.push_back(NumElems);
3428 for (unsigned i = 1; i != NumElems; ++i)
3429 Mask.push_back(i);
3430 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3431 }
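// For example (illustrative): with VT == MVT::v4f32 the mask built above is
// <4, 1, 2, 3>, taking element 0 from V2 and elements 1-3 from V1, which is
// exactly the MOVSS merge of the low scalar of V2 into V1.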
3433 SDValue
3434 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3435 SmallVectorImpl<SDValue> &InVals) const {
3436 SelectionDAG &DAG = CLI.DAG;
3437 SDLoc &dl = CLI.DL;
3438 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3439 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3440 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3441 SDValue Chain = CLI.Chain;
3442 SDValue Callee = CLI.Callee;
3443 CallingConv::ID CallConv = CLI.CallConv;
3444 bool &isTailCall = CLI.IsTailCall;
3445 bool isVarArg = CLI.IsVarArg;
3447 MachineFunction &MF = DAG.getMachineFunction();
3448 bool Is64Bit = Subtarget.is64Bit();
3449 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3450 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3451 bool IsSibcall = false;
3452 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3453 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3454 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3455 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3456 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3457 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3458 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3459 bool HasNoCfCheck =
3460 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3461 const Module *M = MF.getMMI().getModule();
3462 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3464 if (CallConv == CallingConv::X86_INTR)
3465 report_fatal_error("X86 interrupts may not be called directly");
3467 if (Attr.getValueAsString() == "true")
3468 isTailCall = false;
3470 if (Subtarget.isPICStyleGOT() &&
3471 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3472 // If we are using a GOT, disable tail calls to external symbols with
3473 // default visibility. Tail calling such a symbol requires using a GOT
3474 // relocation, which forces early binding of the symbol. This breaks code
3475 // that require lazy function symbol resolution. Using musttail or
3476 // GuaranteedTailCallOpt will override this.
3477 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3478 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3479 G->getGlobal()->hasDefaultVisibility()))
3480 isTailCall = false;
3481 }
3483 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3484 if (IsMustTail) {
3485 // Force this to be a tail call. The verifier rules are enough to ensure
3486 // that we can lower this successfully without moving the return address
3487 // around.
3488 isTailCall = true;
3489 } else if (isTailCall) {
3490 // Check if it's really possible to do a tail call.
3491 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3492 isVarArg, SR != NotStructReturn,
3493 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3494 Outs, OutVals, Ins, DAG);
3496 // Sibcalls are automatically detected tailcalls which do not require
3497 // ABI changes.
3498 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3499 IsSibcall = true;
3501 if (isTailCall)
3502 ++NumTailCalls;
3503 }
3505 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3506 "Var args not supported with calling convention fastcc, ghc or hipe");
3508 // Analyze operands of the call, assigning locations to each operand.
3509 SmallVector<CCValAssign, 16> ArgLocs;
3510 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3512 // Allocate shadow area for Win64.
3513 if (IsWin64)
3514 CCInfo.AllocateStack(32, 8);
3516 CCInfo.AnalyzeArguments(Outs, CC_X86);
3518 // In vectorcall calling convention a second pass is required for the HVA
3519 // registers.
3520 if (CallingConv::X86_VectorCall == CallConv) {
3521 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3522 }
3524 // Get a count of how many bytes are to be pushed on the stack.
3525 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3526 if (IsSibcall)
3527 // This is a sibcall. The memory operands are available in the caller's
3528 // own incoming argument space, i.e. its own caller's stack.
3529 NumBytes = 0;
3530 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3531 canGuaranteeTCO(CallConv))
3532 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3534 int FPDiff = 0;
3535 if (isTailCall && !IsSibcall && !IsMustTail) {
3536 // Lower arguments at fp - stackoffset + fpdiff.
3537 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3539 FPDiff = NumBytesCallerPushed - NumBytes;
3541 // Set the delta of movement of the returnaddr stackslot.
3542 // But only set if delta is greater than previous delta.
3543 if (FPDiff < X86Info->getTCReturnAddrDelta())
3544 X86Info->setTCReturnAddrDelta(FPDiff);
3545 }
3547 unsigned NumBytesToPush = NumBytes;
3548 unsigned NumBytesToPop = NumBytes;
3550 // If we have an inalloca argument, all stack space has already been allocated
3551 // for us and is right at the top of the stack. We don't support multiple
3552 // arguments passed in memory when using inalloca.
3553 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3554 NumBytesToPush = 0;
3555 if (!ArgLocs.back().isMemLoc())
3556 report_fatal_error("cannot use inalloca attribute on a register "
3557 "parameter");
3558 if (ArgLocs.back().getLocMemOffset() != 0)
3559 report_fatal_error("any parameter with the inalloca attribute must be "
3560 "the only memory argument");
3561 }
3563 if (!IsSibcall)
3564 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3565 NumBytes - NumBytesToPush, dl);
3567 SDValue RetAddrFrIdx;
3568 // Load return address for tail calls.
3569 if (isTailCall && FPDiff)
3570 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3571 Is64Bit, FPDiff, dl);
3573 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3574 SmallVector<SDValue, 8> MemOpChains;
3575 SDValue StackPtr;
3577 // The next loop assumes that the locations are in the same order of the
3578 // Outs array.
3579 assert(isSortedByValueNo(ArgLocs) &&
3580 "Argument Location list must be sorted before lowering");
3582 // Walk the register/memloc assignments, inserting copies/loads. In the case
3583 // of tail call optimization arguments are handled later.
3584 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3585 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3586 ++I, ++OutIndex) {
3587 assert(OutIndex < Outs.size() && "Invalid Out index");
3588 // Skip inalloca arguments, they have already been written.
3589 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3590 if (Flags.isInAlloca())
3591 continue;
3593 CCValAssign &VA = ArgLocs[I];
3594 EVT RegVT = VA.getLocVT();
3595 SDValue Arg = OutVals[OutIndex];
3596 bool isByVal = Flags.isByVal();
3598 // Promote the value if needed.
3599 switch (VA.getLocInfo()) {
3600 default: llvm_unreachable("Unknown loc info!");
3601 case CCValAssign::Full: break;
3602 case CCValAssign::SExt:
3603 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3604 break;
3605 case CCValAssign::ZExt:
3606 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3607 break;
3608 case CCValAssign::AExt:
3609 if (Arg.getValueType().isVector() &&
3610 Arg.getValueType().getVectorElementType() == MVT::i1)
3611 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3612 else if (RegVT.is128BitVector()) {
3613 // Special case: passing MMX values in XMM registers.
3614 Arg = DAG.getBitcast(MVT::i64, Arg);
3615 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3616 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3617 } else
3618 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3619 break;
3620 case CCValAssign::BCvt:
3621 Arg = DAG.getBitcast(RegVT, Arg);
3622 break;
3623 case CCValAssign::Indirect: {
3624 // Store the argument.
3625 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3626 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3627 Chain = DAG.getStore(
3628 Chain, dl, Arg, SpillSlot,
3629 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3630 Arg = SpillSlot;
3631 break;
3632 }
3633 }
3635 if (VA.needsCustom()) {
3636 assert(VA.getValVT() == MVT::v64i1 &&
3637 "Currently the only custom case is when we split v64i1 to 2 regs");
3638 // Split v64i1 value into two registers
3639 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3640 Subtarget);
3641 } else if (VA.isRegLoc()) {
3642 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3643 if (isVarArg && IsWin64) {
3644 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3645 // shadow reg if callee is a varargs function.
3646 unsigned ShadowReg = 0;
3647 switch (VA.getLocReg()) {
3648 case X86::XMM0: ShadowReg = X86::RCX; break;
3649 case X86::XMM1: ShadowReg = X86::RDX; break;
3650 case X86::XMM2: ShadowReg = X86::R8; break;
3651 case X86::XMM3: ShadowReg = X86::R9; break;
3652 }
3653 if (ShadowReg)
3654 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3655 }
3656 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3657 assert(VA.isMemLoc());
3658 if (!StackPtr.getNode())
3659 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3660 getPointerTy(DAG.getDataLayout()));
3661 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3662 dl, DAG, VA, Flags));
3663 }
3664 }
3666 if (!MemOpChains.empty())
3667 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3669 if (Subtarget.isPICStyleGOT()) {
3670 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3671 // GOT pointer.
3672 if (!isTailCall) {
3673 RegsToPass.push_back(std::make_pair(
3674 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3675 getPointerTy(DAG.getDataLayout()))));
3676 } else {
3677 // If we are tail calling and generating PIC/GOT style code load the
3678 // address of the callee into ECX. The value in ecx is used as target of
3679 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3680 // for tail calls on PIC/GOT architectures. Normally we would just put the
3681 // address of GOT into ebx and then call target@PLT. But for tail calls
3682 // ebx would be restored (since ebx is callee saved) before jumping to the
3683 // callee.
3685 // Note: The actual moving to ECX is done further down.
3686 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3687 if (G && !G->getGlobal()->hasLocalLinkage() &&
3688 G->getGlobal()->hasDefaultVisibility())
3689 Callee = LowerGlobalAddress(Callee, DAG);
3690 else if (isa<ExternalSymbolSDNode>(Callee))
3691 Callee = LowerExternalSymbol(Callee, DAG);
3692 }
3693 }
3695 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3696 // From AMD64 ABI document:
3697 // For calls that may call functions that use varargs or stdargs
3698 // (prototype-less calls or calls to functions containing ellipsis (...) in
3699 // the declaration) %al is used as hidden argument to specify the number
3700 // of SSE registers used. The contents of %al do not need to match exactly
3701 // the number of registers, but must be an upper bound on the number of SSE
3702 // registers used and be in the range 0 - 8 inclusive.
3704 // Count the number of XMM registers allocated.
3705 static const MCPhysReg XMMArgRegs[] = {
3706 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3707 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3709 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3710 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3711 && "SSE registers cannot be used when SSE is disabled");
3713 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3714 DAG.getConstant(NumXMMRegs, dl,
3715 MVT::i8)));
3716 }
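// Illustrative call site (assumed SysV x86-64): for printf("%f %f", a, b)
// the two doubles occupy XMM0/XMM1, NumXMMRegs is 2, and the code above
// amounts to "movb $2, %al" right before the call, an upper bound on the
// SSE registers the vararg callee may read.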
3718 if (isVarArg && IsMustTail) {
3719 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3720 for (const auto &F : Forwards) {
3721 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3722 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3723 }
3724 }
3726 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3727 // don't need this because the eligibility check rejects calls that require
3728 // shuffling arguments passed in memory.
3729 if (!IsSibcall && isTailCall) {
3730 // Force all the incoming stack arguments to be loaded from the stack
3731 // before any new outgoing arguments are stored to the stack, because the
3732 // outgoing stack slots may alias the incoming argument stack slots, and
3733 // the alias isn't otherwise explicit. This is slightly more conservative
3734 // than necessary, because it means that each store effectively depends
3735 // on every argument instead of just those arguments it would clobber.
3736 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3738 SmallVector<SDValue, 8> MemOpChains2;
3739 SDValue FIN;
3740 int FI = 0;
3741 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3742 ++I, ++OutsIndex) {
3743 CCValAssign &VA = ArgLocs[I];
3745 if (VA.isRegLoc()) {
3746 if (VA.needsCustom()) {
3747 assert((CallConv == CallingConv::X86_RegCall) &&
3748 "Expecting custom case only in regcall calling convention");
3749 // This means that we are in special case where one argument was
3750 // passed through two register locations - skip the next location.
3751 ++I;
3752 }
3754 continue;
3755 }
3757 assert(VA.isMemLoc());
3758 SDValue Arg = OutVals[OutsIndex];
3759 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3760 // Skip inalloca arguments. They don't require any work.
3761 if (Flags.isInAlloca())
3763 // Create frame index.
3764 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3765 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3766 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3767 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3769 if (Flags.isByVal()) {
3770 // Copy relative to framepointer.
3771 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3772 if (!StackPtr.getNode())
3773 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3774 getPointerTy(DAG.getDataLayout()));
3775 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3776 StackPtr, Source);
3778 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3779 ArgChain,
3780 Flags, DAG, dl));
3781 } else {
3782 // Store relative to framepointer.
3783 MemOpChains2.push_back(DAG.getStore(
3784 ArgChain, dl, Arg, FIN,
3785 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3786 }
3787 }
3789 if (!MemOpChains2.empty())
3790 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3792 // Store the return address to the appropriate stack slot.
3793 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3794 getPointerTy(DAG.getDataLayout()),
3795 RegInfo->getSlotSize(), FPDiff, dl);
3796 }
3798 // Build a sequence of copy-to-reg nodes chained together with token chain
3799 // and flag operands which copy the outgoing args into registers.
3800 SDValue InFlag;
3801 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3802 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3803 RegsToPass[i].second, InFlag);
3804 InFlag = Chain.getValue(1);
3805 }
3807 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3808 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3809 // In the 64-bit large code model, we have to make all calls
3810 // through a register, since the call instruction's 32-bit
3811 // pc-relative offset may not be large enough to hold the whole
3812 // address.
3813 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3814 // If the callee is a GlobalAddress node (quite common, every direct call
3815 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3816 // it.
3817 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3819 // We should use extra load for direct calls to dllimported functions in
3820 // non-JIT mode.
3821 const GlobalValue *GV = G->getGlobal();
3822 if (!GV->hasDLLImportStorageClass()) {
3823 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3825 Callee = DAG.getTargetGlobalAddress(
3826 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3828 if (OpFlags == X86II::MO_GOTPCREL) {
3829 // Add a wrapper.
3830 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3831 getPointerTy(DAG.getDataLayout()), Callee);
3832 // Add extra indirection
3833 Callee = DAG.getLoad(
3834 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3835 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3836 }
3837 }
3838 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3839 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3840 unsigned char OpFlags =
3841 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3843 Callee = DAG.getTargetExternalSymbol(
3844 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3846 if (OpFlags == X86II::MO_GOTPCREL) {
3847 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3848 getPointerTy(DAG.getDataLayout()), Callee);
3849 Callee = DAG.getLoad(
3850 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3851 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3852 }
3853 } else if (Subtarget.isTarget64BitILP32() &&
3854 Callee->getValueType(0) == MVT::i32) {
3855 // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
3856 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3857 }
3859 // Returns a chain & a flag for retval copy to use.
3860 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3861 SmallVector<SDValue, 8> Ops;
3863 if (!IsSibcall && isTailCall) {
3864 Chain = DAG.getCALLSEQ_END(Chain,
3865 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3866 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3867 InFlag = Chain.getValue(1);
3868 }
3870 Ops.push_back(Chain);
3871 Ops.push_back(Callee);
3873 if (isTailCall)
3874 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3876 // Add argument registers to the end of the list so that they are known live
3877 // into the call.
3878 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3879 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3880 RegsToPass[i].second.getValueType()));
3882 // Add a register mask operand representing the call-preserved registers.
3883 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3884 // set X86_INTR calling convention because it has the same CSR mask
3885 // (same preserved registers).
3886 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3887 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3888 assert(Mask && "Missing call preserved mask for calling convention");
3890 // If this is an invoke in a 32-bit function using a funclet-based
3891 // personality, assume the function clobbers all registers. If an exception
3892 // is thrown, the runtime will not restore CSRs.
3893 // FIXME: Model this more precisely so that we can register allocate across
3894 // the normal edge and spill and fill across the exceptional edge.
3895 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3896 const Function &CallerFn = MF.getFunction();
3897 EHPersonality Pers =
3898 CallerFn.hasPersonalityFn()
3899 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3900 : EHPersonality::Unknown;
3901 if (isFuncletEHPersonality(Pers))
3902 Mask = RegInfo->getNoPreservedMask();
3903 }
3905 // Define a new register mask from the existing mask.
3906 uint32_t *RegMask = nullptr;
3908 // In some calling conventions we need to remove the used physical registers
3909 // from the reg mask.
3910 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3911 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3913 // Allocate a new Reg Mask and copy Mask.
3914 RegMask = MF.allocateRegMask();
3915 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
3916 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
3918 // Make sure all sub registers of the argument registers are reset
3919 // in the RegMask.
3920 for (auto const &RegPair : RegsToPass)
3921 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3922 SubRegs.isValid(); ++SubRegs)
3923 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3925 // Create the RegMask Operand according to our updated mask.
3926 Ops.push_back(DAG.getRegisterMask(RegMask));
3927 } else {
3928 // Create the RegMask Operand according to the static mask.
3929 Ops.push_back(DAG.getRegisterMask(Mask));
3930 }
3932 if (InFlag.getNode())
3933 Ops.push_back(InFlag);
3935 if (isTailCall) {
3936 // We used to do:
3937 //// If this is the first return lowered for this function, add the regs
3938 //// to the liveout set for the function.
3939 // This isn't right, although it's probably harmless on x86; liveouts
3940 // should be computed from returns not tail calls. Consider a void
3941 // function making a tail call to a function returning int.
3942 MF.getFrameInfo().setHasTailCall();
3943 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3944 }
3946 if (HasNoCfCheck && IsCFProtectionSupported) {
3947 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
3948 } else {
3949 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3950 }
3951 InFlag = Chain.getValue(1);
3953 // Create the CALLSEQ_END node.
3954 unsigned NumBytesForCalleeToPop;
3955 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3956 DAG.getTarget().Options.GuaranteedTailCallOpt))
3957 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3958 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3959 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3960 SR == StackStructReturn)
3961 // If this is a call to a struct-return function, the callee
3962 // pops the hidden struct pointer, so we have to push it back.
3963 // This is common for Darwin/X86, Linux & Mingw32 targets.
3964 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3965 NumBytesForCalleeToPop = 4;
3966 else
3967 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3969 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3970 // No need to reset the stack after the call if the call doesn't return. To
3971 // make the MI verify, we'll pretend the callee does it for us.
3972 NumBytesForCalleeToPop = NumBytes;
3973 }
3975 // Returns a flag for retval copy to use.
3976 if (!IsSibcall) {
3977 Chain = DAG.getCALLSEQ_END(Chain,
3978 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3979 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3980 true),
3981 InFlag, dl);
3982 InFlag = Chain.getValue(1);
3983 }
3985 // Handle result values, copying them out of physregs into vregs that we
3986 // return.
3987 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3988 InVals, RegMask);
3989 }
3991 //===----------------------------------------------------------------------===//
3992 // Fast Calling Convention (tail call) implementation
3993 //===----------------------------------------------------------------------===//
3995 // Like the StdCall calling convention, the callee cleans up the arguments;
3996 // in addition, ECX is reserved for storing the tail called function address,
3997 // so only 2 registers are free for argument passing (inreg). Tail call
3998 // optimization is performed provided:
3999 // * tailcallopt is enabled
4000 // * caller/callee are fastcc
4001 // On X86_64 architecture with GOT-style position independent code only local
4002 // (within module) calls are supported at the moment.
4003 // To keep the stack aligned according to the platform ABI, the function
4004 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4005 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
4006 // for example.) If a tail called callee has more arguments than the caller,
4007 // the caller needs to make sure that there is room to move the RETADDR to.
4008 // This is achieved by reserving an area the size of the argument delta right
4009 // after the original RETADDR, but before the saved framepointer or the
4010 // spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
4011 // stack layout:
4012 // arg1
4013 // arg2
4014 // RETADDR
4015 // [ new RETADDR
4016 // move area ]
4017 // (possible EBP)
4018 // ESI
4019 // EDI
4020 // EBX
4021 // EBP
4022 /// Round the stack size up so that it is aligned to the form e.g. 16n + 12
4023 /// for a 16-byte alignment requirement with a 4-byte return-address slot.
4024 unsigned
4025 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4026 SelectionDAG& DAG) const {
4027 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4028 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4029 unsigned StackAlignment = TFI.getStackAlignment();
4030 uint64_t AlignMask = StackAlignment - 1;
4031 int64_t Offset = StackSize;
4032 unsigned SlotSize = RegInfo->getSlotSize();
4033 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
4034 // Number smaller than 12 so just add the difference.
4035 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
4037 // Mask out lower bits, add stackalignment once plus the 12 bytes.
4038 Offset = ((~AlignMask) & Offset) + StackAlignment +
4039 (StackAlignment-SlotSize);
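// Worked example (illustrative, StackAlignment == 16, SlotSize == 4): a
// StackSize of 20 has (Offset & AlignMask) == 4 <= 12, so Offset becomes
// 20 + (12 - 4) == 28 == 16*1 + 12. A StackSize of 30 has remainder 14 > 12,
// so Offset becomes (30 & ~15) + 16 + 12 == 44 == 16*2 + 12. Either way the
// result is congruent to -SlotSize modulo the stack alignment.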
4044 /// Return true if the given stack call argument is already available in the
4045 /// same position (relatively) of the caller's incoming argument stack.
4046 static
4047 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4048 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4049 const X86InstrInfo *TII, const CCValAssign &VA) {
4050 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4052 for (;;) {
4053 // Look through nodes that don't alter the bits of the incoming value.
4054 unsigned Op = Arg.getOpcode();
4055 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4056 Arg = Arg.getOperand(0);
4057 continue;
4058 }
4059 if (Op == ISD::TRUNCATE) {
4060 const SDValue &TruncInput = Arg.getOperand(0);
4061 if (TruncInput.getOpcode() == ISD::AssertZext &&
4062 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4063 Arg.getValueType()) {
4064 Arg = TruncInput.getOperand(0);
4065 continue;
4066 }
4067 }
4068 break;
4069 }
4071 int FI = INT_MAX;
4072 if (Arg.getOpcode() == ISD::CopyFromReg) {
4073 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4074 if (!TargetRegisterInfo::isVirtualRegister(VR))
4075 return false;
4076 MachineInstr *Def = MRI->getVRegDef(VR);
4077 if (!Def)
4078 return false;
4079 if (!Flags.isByVal()) {
4080 if (!TII->isLoadFromStackSlot(*Def, FI))
4081 return false;
4082 } else {
4083 unsigned Opcode = Def->getOpcode();
4084 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4085 Opcode == X86::LEA64_32r) &&
4086 Def->getOperand(1).isFI()) {
4087 FI = Def->getOperand(1).getIndex();
4088 Bytes = Flags.getByValSize();
4089 } else
4090 return false;
4091 }
4092 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4093 if (Flags.isByVal())
4094 // ByVal argument is passed in as a pointer but it's now being
4095 // dereferenced. e.g.
4096 // define @foo(%struct.X* %A) {
4097 // tail call @bar(%struct.X* byval %A)
4098 // }
4099 return false;
4100 SDValue Ptr = Ld->getBasePtr();
4101 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4102 if (!FINode)
4103 return false;
4104 FI = FINode->getIndex();
4105 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4106 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4107 FI = FINode->getIndex();
4108 Bytes = Flags.getByValSize();
4109 } else
4110 return false;
4112 assert(FI != INT_MAX);
4113 if (!MFI.isFixedObjectIndex(FI))
4114 return false;
4116 if (Offset != MFI.getObjectOffset(FI))
4117 return false;
4119 // If this is not byval, check that the argument stack object is immutable.
4120 // inalloca and argument copy elision can create mutable argument stack
4121 // objects. Byval objects can be mutated, but a byval call intends to pass the
4122 // mutated memory.
4123 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4124 return false;
4126 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4127 // If the argument location is wider than the argument type, check that any
4128 // extension flags match.
4129 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4130 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4131 return false;
4132 }
4133 }
4135 return Bytes == MFI.getObjectSize(FI);
4136 }
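// Illustrative match (assumed scenario): when f(int x) tail-calls g(x) with
// x untouched, the CopyFromReg above chases back to a load from a fixed,
// immutable stack object whose offset and size equal the outgoing location,
// so no store is needed for that argument.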
4138 /// Check whether the call is eligible for tail call optimization. Targets
4139 /// that want to do tail call optimization should implement this function.
4140 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4141 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4142 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4143 const SmallVectorImpl<ISD::OutputArg> &Outs,
4144 const SmallVectorImpl<SDValue> &OutVals,
4145 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4146 if (!mayTailCallThisCC(CalleeCC))
4147 return false;
4149 // If -tailcallopt is specified, make fastcc functions tail-callable.
4150 MachineFunction &MF = DAG.getMachineFunction();
4151 const Function &CallerF = MF.getFunction();
4153 // If the function return type is x86_fp80 and the callee return type is not,
4154 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4155 // perform a tailcall optimization here.
4156 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4157 return false;
4159 CallingConv::ID CallerCC = CallerF.getCallingConv();
4160 bool CCMatch = CallerCC == CalleeCC;
4161 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4162 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4164 // Win64 functions have extra shadow space for argument homing. Don't do the
4165 // sibcall if the caller and callee have mismatched expectations for this
4166 // space.
4167 if (IsCalleeWin64 != IsCallerWin64)
4168 return false;
4170 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4171 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4172 return true;
4173 return false;
4174 }
4176 // Look for obvious safe cases to perform tail call optimization that do not
4177 // require ABI changes. This is what gcc calls sibcall.
4179 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4180 // emit a special epilogue.
4181 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4182 if (RegInfo->needsStackRealignment(MF))
4183 return false;
4185 // Also avoid sibcall optimization if either caller or callee uses struct
4186 // return semantics.
4187 if (isCalleeStructRet || isCallerStructRet)
4188 return false;
4190 // Do not sibcall optimize vararg calls unless all arguments are passed via
4191 // registers.
4192 LLVMContext &C = *DAG.getContext();
4193 if (isVarArg && !Outs.empty()) {
4194 // Optimizing for varargs on Win64 is unlikely to be safe without
4195 // additional testing.
4196 if (IsCalleeWin64 || IsCallerWin64)
4199 SmallVector<CCValAssign, 16> ArgLocs;
4200 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4202 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4203 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4204 if (!ArgLocs[i].isRegLoc())
4205 return false;
4206 }
4208 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4209 // stack. Therefore, if it's not used by the call it is not safe to optimize
4210 // this into a sibcall.
4211 bool Unused = false;
4212 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4213 if (!Ins[i].Used) {
4214 Unused = true;
4215 break;
4216 }
4217 }
4218 if (Unused) {
4219 SmallVector<CCValAssign, 16> RVLocs;
4220 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4221 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4222 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4223 CCValAssign &VA = RVLocs[i];
4224 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4225 return false;
4226 }
4227 }
4229 // Check that the call results are passed in the same way.
4230 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4231 RetCC_X86, RetCC_X86))
4232 return false;
4233 // The callee has to preserve all registers the caller needs to preserve.
4234 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4235 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4236 if (!CCMatch) {
4237 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4238 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4239 return false;
4240 }
4242 unsigned StackArgsSize = 0;
4244 // If the callee takes no arguments then go on to check the results of the
4245 // call.
4246 if (!Outs.empty()) {
4247 // Check if stack adjustment is needed. For now, do not do this if any
4248 // argument is passed on the stack.
4249 SmallVector<CCValAssign, 16> ArgLocs;
4250 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4252 // Allocate shadow area for Win64
4253 if (IsCalleeWin64)
4254 CCInfo.AllocateStack(32, 8);
4256 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4257 StackArgsSize = CCInfo.getNextStackOffset();
4259 if (CCInfo.getNextStackOffset()) {
4260 // Check if the arguments are already laid out in the right way as
4261 // the caller's fixed stack objects.
4262 MachineFrameInfo &MFI = MF.getFrameInfo();
4263 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4264 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4265 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4266 CCValAssign &VA = ArgLocs[i];
4267 SDValue Arg = OutVals[i];
4268 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4269 if (VA.getLocInfo() == CCValAssign::Indirect)
4270 return false;
4271 if (!VA.isRegLoc()) {
4272 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4273 MFI, MRI, TII, VA))
4274 return false;
4275 }
4276 }
4277 }
4279 bool PositionIndependent = isPositionIndependent();
4280 // If the tailcall address may be in a register, then make sure it's
4281 // possible to register allocate for it. In 32-bit, the call address can
4282 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4283 // callee-saved registers are restored. These happen to be the same
4284 // registers used to pass 'inreg' arguments so watch out for those.
4285 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4286 !isa<ExternalSymbolSDNode>(Callee)) ||
4287 PositionIndependent)) {
4288 unsigned NumInRegs = 0;
4289 // In PIC we need an extra register to formulate the address computation
4290 // for the callee.
4291 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4293 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4294 CCValAssign &VA = ArgLocs[i];
4297 unsigned Reg = VA.getLocReg();
4300 case X86::EAX: case X86::EDX: case X86::ECX:
4301 if (++NumInRegs == MaxInRegs)
4308 const MachineRegisterInfo &MRI = MF.getRegInfo();
4309 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4310 return false;
4311 }
4313 bool CalleeWillPop =
4314 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4315 MF.getTarget().Options.GuaranteedTailCallOpt);
4317 if (unsigned BytesToPop =
4318 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4319 // If we have bytes to pop, the callee must pop them.
4320 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4321 if (!CalleePopMatches)
4322 return false;
4323 } else if (CalleeWillPop && StackArgsSize > 0) {
4324 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4325 return false;
4326 }
4328 return true;
4329 }
4331 FastISel *
4332 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4333 const TargetLibraryInfo *libInfo) const {
4334 return X86::createFastISel(funcInfo, libInfo);
4335 }
4337 //===----------------------------------------------------------------------===//
4338 // Other Lowering Hooks
4339 //===----------------------------------------------------------------------===//
4341 static bool MayFoldLoad(SDValue Op) {
4342 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4343 }
4345 static bool MayFoldIntoStore(SDValue Op) {
4346 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4347 }
4349 static bool MayFoldIntoZeroExtend(SDValue Op) {
4350 if (Op.hasOneUse()) {
4351 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4352 return (ISD::ZERO_EXTEND == Opcode);
4353 }
4354 return false;
4355 }
4357 static bool isTargetShuffle(unsigned Opcode) {
4358 switch(Opcode) {
4359 default: return false;
4360 case X86ISD::BLENDI:
4361 case X86ISD::PSHUFB:
4362 case X86ISD::PSHUFD:
4363 case X86ISD::PSHUFHW:
4364 case X86ISD::PSHUFLW:
4365 case X86ISD::SHUFP:
4366 case X86ISD::INSERTPS:
4367 case X86ISD::EXTRQI:
4368 case X86ISD::INSERTQI:
4369 case X86ISD::PALIGNR:
4370 case X86ISD::VSHLDQ:
4371 case X86ISD::VSRLDQ:
4372 case X86ISD::MOVLHPS:
4373 case X86ISD::MOVHLPS:
4374 case X86ISD::MOVSHDUP:
4375 case X86ISD::MOVSLDUP:
4376 case X86ISD::MOVDDUP:
4377 case X86ISD::MOVSS:
4378 case X86ISD::MOVSD:
4379 case X86ISD::UNPCKL:
4380 case X86ISD::UNPCKH:
4381 case X86ISD::VBROADCAST:
4382 case X86ISD::VPERMILPI:
4383 case X86ISD::VPERMILPV:
4384 case X86ISD::VPERM2X128:
4385 case X86ISD::SHUF128:
4386 case X86ISD::VPERMIL2:
4387 case X86ISD::VPERMI:
4388 case X86ISD::VPPERM:
4389 case X86ISD::VPERMV:
4390 case X86ISD::VPERMV3:
4391 case X86ISD::VZEXT_MOVL:
4392 return true;
4393 }
4394 }
4396 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4397 switch (Opcode) {
4398 default: return false;
4399 // Target Shuffles.
4400 case X86ISD::PSHUFB:
4401 case X86ISD::VPERMILPV:
4402 case X86ISD::VPERMIL2:
4403 case X86ISD::VPPERM:
4404 case X86ISD::VPERMV:
4405 case X86ISD::VPERMV3:
4406 return true;
4407 // 'Faux' Target Shuffles.
4408 case ISD::AND:
4409 case X86ISD::ANDNP:
4410 return true;
4411 }
4412 }
4414 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4415 MachineFunction &MF = DAG.getMachineFunction();
4416 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4417 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4418 int ReturnAddrIndex = FuncInfo->getRAIndex();
4420 if (ReturnAddrIndex == 0) {
4421 // Set up a frame object for the return address.
4422 unsigned SlotSize = RegInfo->getSlotSize();
4423 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4424 -(int64_t)SlotSize,
4425 false);
4426 FuncInfo->setRAIndex(ReturnAddrIndex);
4427 }
4429 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4432 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4433 bool hasSymbolicDisplacement) {
4434 // The offset should fit into a 32-bit immediate field.
4435 if (!isInt<32>(Offset))
4436 return false;
4438 // If we don't have a symbolic displacement - we don't have any extra
4439 // restrictions.
4440 if (!hasSymbolicDisplacement)
4441 return true;
4443 // FIXME: Some tweaks might be needed for medium code model.
4444 if (M != CodeModel::Small && M != CodeModel::Kernel)
4445 return false;
4447 // For the small code model we assume that the last object is 16MB before
4448 // the end of the 31-bit boundary. We may also accept pretty large negative
4449 // constants, knowing that all objects are in the positive half of the address space.
4450 if (M == CodeModel::Small && Offset < 16*1024*1024)
4451 return true;
4453 // For the kernel code model we know that all objects reside in the negative
4454 // half of the 32-bit address space, so we may not accept negative offsets
4455 // (they may fall just outside that half), but we may accept pretty large positive ones.
4456 if (M == CodeModel::Kernel && Offset >= 0)
4457 return true;
4459 return false;
4460 }
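// Illustrative example (a sketch based on the checks above, not taken from
// the original source): with a symbolic displacement under the kernel code
// model, non-negative offsets are suitable while negative ones are not:
//   X86::isOffsetSuitableForCodeModel(64, CodeModel::Kernel, true);   // true
//   X86::isOffsetSuitableForCodeModel(-64, CodeModel::Kernel, true);  // false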
4462 /// Determines whether the callee is required to pop its own arguments.
4463 /// Callee pop is necessary to support tail calls.
4464 bool X86::isCalleePop(CallingConv::ID CallingConv,
4465 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4466 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4467 // can guarantee TCO.
4468 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4469 return true;
4471 switch (CallingConv) {
4472 default:
4473 return false;
4474 case CallingConv::X86_StdCall:
4475 case CallingConv::X86_FastCall:
4476 case CallingConv::X86_ThisCall:
4477 case CallingConv::X86_VectorCall:
4478 return !is64Bit;
4479 }
4480 }
4482 /// Return true if the condition is an unsigned comparison operation.
4483 static bool isX86CCUnsigned(unsigned X86CC) {
4484 switch (X86CC) {
4485 default:
4486 llvm_unreachable("Invalid integer condition!");
4487 case X86::COND_E:
4488 case X86::COND_NE:
4489 case X86::COND_B:
4490 case X86::COND_A:
4491 case X86::COND_BE:
4492 case X86::COND_AE:
4493 return true;
4494 case X86::COND_G:
4495 case X86::COND_GE:
4496 case X86::COND_L:
4497 case X86::COND_LE:
4498 return false;
4499 }
4500 }
4502 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4503 switch (SetCCOpcode) {
4504 default: llvm_unreachable("Invalid integer condition!");
4505 case ISD::SETEQ: return X86::COND_E;
4506 case ISD::SETGT: return X86::COND_G;
4507 case ISD::SETGE: return X86::COND_GE;
4508 case ISD::SETLT: return X86::COND_L;
4509 case ISD::SETLE: return X86::COND_LE;
4510 case ISD::SETNE: return X86::COND_NE;
4511 case ISD::SETULT: return X86::COND_B;
4512 case ISD::SETUGT: return X86::COND_A;
4513 case ISD::SETULE: return X86::COND_BE;
4514 case ISD::SETUGE: return X86::COND_AE;
4515 }
4516 }
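// Illustrative example (sketch): the unsigned and signed forms of "less than"
// map to different flag tests:
//   TranslateIntegerX86CC(ISD::SETULT) == X86::COND_B;  // CF-based
//   TranslateIntegerX86CC(ISD::SETLT)  == X86::COND_L;  // SF/OF-based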
4518 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4519 /// condition code, returning the condition code and the LHS/RHS of the
4520 /// comparison to make.
4521 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4522 bool isFP, SDValue &LHS, SDValue &RHS,
4523 SelectionDAG &DAG) {
4524 if (!isFP) {
4525 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4526 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4527 // X > -1 -> X == 0, jump !sign.
4528 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4529 return X86::COND_NS;
4530 }
4531 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4532 // X < 0 -> X == 0, jump on sign.
4533 return X86::COND_S;
4534 }
4535 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4536 // X < 1 -> X <= 0
4537 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4538 return X86::COND_LE;
4539 }
4540 }
4542 return TranslateIntegerX86CC(SetCCOpcode);
4543 }
4545 // First determine if it is required or is profitable to flip the operands.
4547 // If LHS is a foldable load, but RHS is not, flip the condition.
4548 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4549 !ISD::isNON_EXTLoad(RHS.getNode())) {
4550 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4551 std::swap(LHS, RHS);
4554 switch (SetCCOpcode) {
4555 default: break;
4556 case ISD::SETOLT:
4557 case ISD::SETOLE:
4558 case ISD::SETUGT:
4559 case ISD::SETUGE:
4560 std::swap(LHS, RHS);
4561 break;
4562 }
4564 // On a floating point condition, the flags are set as follows:
4565 // ZF | PF | CF | op
4566 // 0 | 0 | 0 | X > Y
4567 // 0 | 0 | 1 | X < Y
4568 // 1 | 0 | 0 | X == Y
4569 // 1 | 1 | 1 | unordered
4570 switch (SetCCOpcode) {
4571 default: llvm_unreachable("Condcode should be pre-legalized away");
4572 case ISD::SETUEQ:
4573 case ISD::SETEQ: return X86::COND_E;
4574 case ISD::SETOLT: // flipped
4575 case ISD::SETOGT:
4576 case ISD::SETGT: return X86::COND_A;
4577 case ISD::SETOLE: // flipped
4578 case ISD::SETOGE:
4579 case ISD::SETGE: return X86::COND_AE;
4580 case ISD::SETUGT: // flipped
4581 case ISD::SETULT:
4582 case ISD::SETLT: return X86::COND_B;
4583 case ISD::SETUGE: // flipped
4584 case ISD::SETULE:
4585 case ISD::SETLE: return X86::COND_BE;
4586 case ISD::SETONE:
4587 case ISD::SETNE: return X86::COND_NE;
4588 case ISD::SETUO: return X86::COND_P;
4589 case ISD::SETO: return X86::COND_NP;
4590 case ISD::SETOEQ:
4591 case ISD::SETUNE: return X86::COND_INVALID;
4592 }
4593 }
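// Illustrative example (sketch): for integer compares TranslateX86CC may
// rewrite the RHS before choosing a flag. Given 'X > -1' (ISD::SETGT with an
// all-ones RHS) it replaces the RHS with 0 and returns X86::COND_NS, turning
// the compare into a sign test; 'X < 1' is likewise turned into 'X <= 0'
// via X86::COND_LE.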
4595 /// Is there a floating point cmov for the specific X86 condition code?
4596 /// The current x86 ISA includes the following FP cmov instructions:
4597 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4598 static bool hasFPCMov(unsigned X86CC) {
4599 switch (X86CC) {
4600 default:
4601 return false;
4602 case X86::COND_B:
4603 case X86::COND_BE:
4604 case X86::COND_E:
4605 case X86::COND_P:
4606 case X86::COND_A:
4607 case X86::COND_AE:
4608 case X86::COND_NE:
4609 case X86::COND_NP:
4610 return true;
4611 }
4612 }
4615 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4617 MachineFunction &MF,
4618 unsigned Intrinsic) const {
4620 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4621 if (!IntrData)
4622 return false;
4624 Info.opc = ISD::INTRINSIC_W_CHAIN;
4625 Info.flags = MachineMemOperand::MONone;
4626 Info.offset = 0;
4628 switch (IntrData->Type) {
4629 case TRUNCATE_TO_MEM_VI8:
4630 case TRUNCATE_TO_MEM_VI16:
4631 case TRUNCATE_TO_MEM_VI32: {
4632 Info.ptrVal = I.getArgOperand(0);
4633 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4634 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4635 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4636 ScalarVT = MVT::i8;
4637 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4638 ScalarVT = MVT::i16;
4639 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4640 ScalarVT = MVT::i32;
4642 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4643 Info.align = 1;
4644 Info.flags |= MachineMemOperand::MOStore;
4645 break;
4646 }
4647 default:
4648 return false;
4649 }
4651 return true;
4652 }
4654 /// Returns true if the target can instruction select the
4655 /// specified FP immediate natively. If false, the legalizer will
4656 /// materialize the FP immediate as a load from a constant pool.
4657 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4658 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4659 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4660 return true;
4661 }
4662 return false;
4663 }
4665 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4666 ISD::LoadExtType ExtTy,
4667 EVT NewVT) const {
4668 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4669 // relocation target a movq or addq instruction: don't let the load shrink.
4670 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4671 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4672 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4673 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4675 return true;
4676 }
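// Illustrative note (sketch): this only refuses to narrow TLS initial-exec
// loads, e.g. a 64-bit load of 'x@GOTTPOFF(%rip)' must stay a movq so the
// linker can still rewrite it; unrelated RIP-relative loads may be shrunk.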
4677 /// Returns true if it is beneficial to convert a load of a constant
4678 /// to just the constant itself.
4679 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4680 Type *Ty) const {
4681 assert(Ty->isIntegerTy());
4683 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4684 if (BitSize == 0 || BitSize > 64)
4685 return false;
4686 return true;
4687 }
4689 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4690 // TODO: It might be a win to ease or lift this restriction, but the generic
4691 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4692 if (VT.isVector() && Subtarget.hasAVX512())
4693 return false;
4695 return true;
4696 }
4698 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4699 unsigned Index) const {
4700 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4701 return false;
4703 // Mask vectors support all subregister combinations and operations that
4704 // extract half of vector.
4705 if (ResVT.getVectorElementType() == MVT::i1)
4706 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4707 (Index == ResVT.getVectorNumElements()));
4709 return (Index % ResVT.getVectorNumElements()) == 0;
4710 }
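// Illustrative example (sketch): extracting ResVT = v4f32 from SrcVT = v8f32
// is cheap for Index 0 or 4 (a plain 128-bit subregister), while Index 2
// would need a real shuffle and is reported as not cheap.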
4712 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4713 // Speculate cttz only if we can directly use TZCNT.
4714 return Subtarget.hasBMI();
4717 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4718 // Speculate ctlz only if we can directly use LZCNT.
4719 return Subtarget.hasLZCNT();
4722 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4723 EVT BitcastVT) const {
4724 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4725 return false;
4727 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4730 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4731 const SelectionDAG &DAG) const {
4732 // Do not merge to float value size (128 bits) if no implicit
4733 // float attribute is set.
4734 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4735 Attribute::NoImplicitFloat);
4737 if (NoFloat) {
4738 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4739 return (MemVT.getSizeInBits() <= MaxIntSize);
4740 }
4741 return true;
4742 }
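// Illustrative example (sketch): in a NoImplicitFloat function on a 64-bit
// target, stores merge only up to MemVT = i64; a 128-bit merged store would
// need an XMM register and is rejected. Without the attribute, any width is
// accepted.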
4744 bool X86TargetLowering::isCtlzFast() const {
4745 return Subtarget.hasFastLZCNT();
4748 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4749 const Instruction &AndI) const {
4750 return true;
4751 }
4753 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4754 EVT VT = Y.getValueType();
4756 if (VT.isVector())
4757 return false;
4759 if (!Subtarget.hasBMI())
4760 return false;
4762 // There are only 32-bit and 64-bit forms for 'andn'.
4763 if (VT != MVT::i32 && VT != MVT::i64)
4764 return false;
4766 // A mask and compare against constant is ok for an 'andn' too
4767 // even though the BMI instruction doesn't have an immediate form.
4768 return true;
4769 }
4772 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4773 EVT VT = Y.getValueType();
4775 if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
4776 return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
4780 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4781 return false;
4783 if (VT == MVT::v4i32)
4784 return true;
4786 return Subtarget.hasSSE2();
4787 }
4789 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
4790 EVT VT = Y.getValueType();
4792 // For vectors, we don't have a preference, but we probably want a mask.
4793 if (VT.isVector())
4794 return false;
4796 // 64-bit shifts on 32-bit targets produce really bad bloated code.
4797 if (VT == MVT::i64 && !Subtarget.is64Bit())
4798 return true;
4800 return false;
4801 }
4803 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4804 MVT VT = MVT::getIntegerVT(NumBits);
4805 if (isTypeLegal(VT))
4806 return VT;
4808 // PMOVMSKB can handle this.
4809 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4810 return MVT::v16i8;
4812 // VPMOVMSKB can handle this.
4813 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4814 return MVT::v32i8;
4816 // TODO: Allow 64-bit type for 32-bit target.
4817 // TODO: 512-bit types should be allowed, but make sure that those
4818 // cases are handled in combineVectorSizedSetCCEquality().
4820 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4821 }
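// Illustrative example (sketch): a 128-bit equality test can avoid scalar
// compares by using the vector mask instructions named above, roughly:
//   v16i8 C = PCMPEQB(A, B);   // lanewise i8 equality
//   i32   M = PMOVMSKB(C);     // collect the 16 sign bits
//   A == B  <=>  M == 0xFFFF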
4823 /// Val is the undef sentinel value or equal to the specified value.
4824 static bool isUndefOrEqual(int Val, int CmpVal) {
4825 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4828 /// Val is either the undef or zero sentinel value.
4829 static bool isUndefOrZero(int Val) {
4830 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4833 /// Return true if every element in Mask, beginning
4834 /// from position Pos and ending in Pos+Size, is the undef sentinel value.
4835 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4836 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4837 if (Mask[i] != SM_SentinelUndef)
4838 return false;
4839 return true;
4840 }
4842 /// Return true if Val falls within the specified range [Low, Hi).
4843 static bool isInRange(int Val, int Low, int Hi) {
4844 return (Val >= Low && Val < Hi);
4847 /// Return true if the value of any element in Mask falls within the specified
4848 /// range [Low, Hi).
4849 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
4850 for (int M : Mask)
4851 if (isInRange(M, Low, Hi))
4852 return true;
4854 return false;
4855 }
4856 /// Return true if Val is undef or if its value falls within the
4857 /// specified range [Low, Hi).
4858 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4859 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
4862 /// Return true if every element in Mask is undef or if its value
4863 /// falls within the specified range [Low, Hi).
4864 static bool isUndefOrInRange(ArrayRef<int> Mask,
4865 int Low, int Hi) {
4866 for (int M : Mask)
4867 if (!isUndefOrInRange(M, Low, Hi))
4868 return false;
4869 return true;
4870 }
4872 /// Return true if Val is undef, zero or if its value falls within the
4873 /// specified range [Low, Hi).
4874 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4875 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
4878 /// Return true if every element in Mask is undef, zero or if its value
4879 /// falls within the specified range [Low, Hi).
4880 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4881 for (int M : Mask)
4882 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4883 return false;
4884 return true;
4885 }
4887 /// Return true if every element in Mask, beginning
4888 /// from position Pos and ending in Pos + Size, falls within the specified
4889 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
4890 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4891 unsigned Size, int Low, int Step = 1) {
4892 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4893 if (!isUndefOrEqual(Mask[i], Low))
4894 return false;
4895 return true;
4896 }
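// Illustrative example (sketch):
//   isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4) returns true,
// since element 1 is undef and the rest match the sequence 4, 5, 6, 7.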
4898 /// Return true if every element in Mask, beginning
4899 /// from position Pos and ending in Pos+Size, falls within the specified
4900 /// sequential range [Low, Low + Size), or is undef or is zero.
4901 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4902 unsigned Size, int Low) {
4903 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4904 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4905 return false;
4906 return true;
4907 }
4909 /// Return true if every element in Mask, beginning
4910 /// from position Pos and ending in Pos+Size, is undef or is zero.
4911 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4912 unsigned Size) {
4913 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4914 if (!isUndefOrZero(Mask[i]))
4915 return false;
4916 return true;
4917 }
4919 /// Helper function to test whether a shuffle mask could be
4920 /// simplified by widening the elements being shuffled.
4922 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4923 /// leaves it in an unspecified state.
4925 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4926 /// shuffle masks. The latter have the special property of a '-2' representing
4927 // a zeroed lane of a vector.
4928 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4929 SmallVectorImpl<int> &WidenedMask) {
4930 WidenedMask.assign(Mask.size() / 2, 0);
4931 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4932 int M0 = Mask[i];
4933 int M1 = Mask[i + 1];
4935 // If both elements are undef, it's trivial.
4936 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4937 WidenedMask[i / 2] = SM_SentinelUndef;
4938 continue;
4939 }
4941 // Check for an undef mask and a mask value properly aligned to fit with
4942 // a pair of values. If we find such a case, use the non-undef mask's value.
4943 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4944 WidenedMask[i / 2] = M1 / 2;
4945 continue;
4946 }
4947 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4948 WidenedMask[i / 2] = M0 / 2;
4949 continue;
4950 }
4952 // When zeroing, we need to spread the zeroing across both lanes to widen.
4953 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4954 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4955 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4956 WidenedMask[i / 2] = SM_SentinelZero;
4957 continue;
4958 }
4959 return false;
4960 }
4962 // Finally check if the two mask values are adjacent and aligned with
4963 // a pair.
4964 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4965 WidenedMask[i / 2] = M0 / 2;
4966 continue;
4967 }
4969 // Otherwise we can't safely widen the elements used in this shuffle.
4970 return false;
4971 }
4972 assert(WidenedMask.size() == Mask.size() / 2 &&
4973 "Incorrect size of mask after widening the elements!");
4978 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4979 const APInt &Zeroable,
4980 SmallVectorImpl<int> &WidenedMask) {
4981 SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
4982 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
4983 if (TargetMask[i] == SM_SentinelUndef)
4984 continue;
4985 if (Zeroable[i])
4986 TargetMask[i] = SM_SentinelZero;
4987 }
4988 return canWidenShuffleElements(TargetMask, WidenedMask);
4991 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
4992 SmallVector<int, 32> WidenedMask;
4993 return canWidenShuffleElements(Mask, WidenedMask);
4996 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4997 bool X86::isZeroNode(SDValue Elt) {
4998 return isNullConstant(Elt) || isNullFPConstant(Elt);
5001 // Build a vector of constants.
5002 // Use an UNDEF node if MaskElt == -1.
5003 // Split 64-bit constants in the 32-bit mode.
5004 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5005 const SDLoc &dl, bool IsMask = false) {
5007 SmallVector<SDValue, 32> Ops;
5008 bool Split = false;
5010 MVT ConstVecVT = VT;
5011 unsigned NumElts = VT.getVectorNumElements();
5012 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5013 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5014 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5015 Split = true;
5016 }
5018 MVT EltVT = ConstVecVT.getVectorElementType();
5019 for (unsigned i = 0; i < NumElts; ++i) {
5020 bool IsUndef = Values[i] < 0 && IsMask;
5021 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5022 DAG.getConstant(Values[i], dl, EltVT);
5023 Ops.push_back(OpNode);
5024 if (Split)
5025 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5026 DAG.getConstant(0, dl, EltVT));
5027 }
5028 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5029 if (Split)
5030 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5031 return ConstsNode;
5032 }
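// Illustrative example (sketch): on a 32-bit target (i64 is not legal),
// getConstVector({1, -1}, MVT::v2i64, DAG, dl, /*IsMask=*/true) builds the
// v4i32 vector <1, 0, undef, undef> and bitcasts it back to v2i64, splitting
// each 64-bit lane into a pair of 32-bit constants.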
5034 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5035 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5036 assert(Bits.size() == Undefs.getBitWidth() &&
5037 "Unequal constant and undef arrays");
5038 SmallVector<SDValue, 32> Ops;
5039 bool Split = false;
5041 MVT ConstVecVT = VT;
5042 unsigned NumElts = VT.getVectorNumElements();
5043 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5044 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5045 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5046 Split = true;
5047 }
5049 MVT EltVT = ConstVecVT.getVectorElementType();
5050 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5051 if (Undefs[i]) {
5052 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5053 continue;
5054 }
5055 const APInt &V = Bits[i];
5056 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5057 if (Split) {
5058 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5059 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5060 } else if (EltVT == MVT::f32) {
5061 APFloat FV(APFloat::IEEEsingle(), V);
5062 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5063 } else if (EltVT == MVT::f64) {
5064 APFloat FV(APFloat::IEEEdouble(), V);
5065 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5067 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5071 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5072 return DAG.getBitcast(VT, ConstsNode);
5075 /// Returns a vector of specified type with all zero elements.
5076 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5077 SelectionDAG &DAG, const SDLoc &dl) {
5078 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5079 VT.getVectorElementType() == MVT::i1) &&
5080 "Unexpected vector type");
5082 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5083 // type. This ensures they get CSE'd. But if the integer type is not
5084 // available, use a floating-point +0.0 instead.
5085 SDValue Vec;
5086 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5087 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5088 } else if (VT.getVectorElementType() == MVT::i1) {
5089 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5090 "Unexpected vector type");
5091 Vec = DAG.getConstant(0, dl, VT);
5093 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5094 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5096 return DAG.getBitcast(VT, Vec);
5099 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5100 const SDLoc &dl, unsigned vectorWidth) {
5101 EVT VT = Vec.getValueType();
5102 EVT ElVT = VT.getVectorElementType();
5103 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5104 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5105 VT.getVectorNumElements()/Factor);
5107 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5108 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5109 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5111 // This is the index of the first element of the vectorWidth-bit chunk
5112 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5113 IdxVal &= ~(ElemsPerChunk - 1);
5115 // If the input is a buildvector just emit a smaller one.
5116 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5117 return DAG.getBuildVector(ResultVT, dl,
5118 Vec->ops().slice(IdxVal, ElemsPerChunk));
5120 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5121 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5122 }
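// Illustrative example (sketch): extracting 128 bits from a v8i32 with
// IdxVal = 3 first rounds the index down to a chunk boundary (ElemsPerChunk
// is 4, so IdxVal becomes 0) and then emits a v4i32 EXTRACT_SUBVECTOR at
// element 0.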
5124 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5125 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5126 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5127 /// instructions or a simple subregister reference. Idx is an index in the
5128 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5129 /// lowering EXTRACT_VECTOR_ELT operations easier.
5130 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5131 SelectionDAG &DAG, const SDLoc &dl) {
5132 assert((Vec.getValueType().is256BitVector() ||
5133 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5134 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5137 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5138 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5139 SelectionDAG &DAG, const SDLoc &dl) {
5140 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5141 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5144 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5145 SelectionDAG &DAG, const SDLoc &dl,
5146 unsigned vectorWidth) {
5147 assert((vectorWidth == 128 || vectorWidth == 256) &&
5148 "Unsupported vector width");
5149 // Inserting an UNDEF subvector simply returns Result.
5150 if (Vec.isUndef())
5151 return Result;
5152 EVT VT = Vec.getValueType();
5153 EVT ElVT = VT.getVectorElementType();
5154 EVT ResultVT = Result.getValueType();
5156 // Insert the relevant vectorWidth bits.
5157 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5158 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5160 // This is the index of the first element of the vectorWidth-bit chunk
5161 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5162 IdxVal &= ~(ElemsPerChunk - 1);
5164 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5165 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5168 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5169 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5170 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5171 /// simple superregister reference. Idx is an index in the 128 bits
5172 /// we want. It need not be aligned to a 128-bit boundary. That makes
5173 /// lowering INSERT_VECTOR_ELT operations easier.
5174 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5175 SelectionDAG &DAG, const SDLoc &dl) {
5176 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5177 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5180 /// Widen a vector to a larger size with the same scalar type, with the new
5181 /// elements either zero or undef.
5182 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5183 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5185 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5186 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5187 "Unsupported vector widening type");
5188 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5189 : DAG.getUNDEF(VT);
5190 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5191 DAG.getIntPtrConstant(0, dl));
5194 // Helper for splitting operands of an operation to a legal target size and
5195 // applying a function on each part.
5196 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5197 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5198 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5199 // The argument Builder is a function that will be applied on each split part:
5200 // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
5201 template <typename F>
5202 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5203 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5204 F Builder, bool CheckBWI = true) {
5205 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5206 unsigned NumSubs = 1;
5207 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5208 (!CheckBWI && Subtarget.useAVX512Regs())) {
5209 if (VT.getSizeInBits() > 512) {
5210 NumSubs = VT.getSizeInBits() / 512;
5211 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5212 }
5213 } else if (Subtarget.hasAVX2()) {
5214 if (VT.getSizeInBits() > 256) {
5215 NumSubs = VT.getSizeInBits() / 256;
5216 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5217 }
5218 } else {
5219 if (VT.getSizeInBits() > 128) {
5220 NumSubs = VT.getSizeInBits() / 128;
5221 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5222 }
5223 }
5225 if (NumSubs == 1)
5226 return Builder(DAG, DL, Ops);
5228 SmallVector<SDValue, 4> Subs;
5229 for (unsigned i = 0; i != NumSubs; ++i) {
5230 SmallVector<SDValue, 2> SubOps;
5231 for (SDValue Op : Ops) {
5232 EVT OpVT = Op.getValueType();
5233 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5234 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5235 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5237 Subs.push_back(Builder(DAG, DL, SubOps));
5238 }
5239 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5240 }
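// Usage sketch (the builder below is a hypothetical illustration): on an
// AVX2-only target, a v64i8 unsigned saturating add can be emitted as two
// 256-bit X86ISD::ADDUS nodes that are concatenated back together:
//   auto ADDUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                          ArrayRef<SDValue> Ops) {
//     return DAG.getNode(X86ISD::ADDUS, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, MVT::v64i8,
//                                {LHS, RHS}, ADDUSBuilder);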
5242 // Return true if the instruction zeroes the unused upper part of the
5243 // destination and accepts a mask.
5244 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5245 switch (Opcode) {
5246 default:
5247 return false;
5248 case X86ISD::CMPM:
5249 case X86ISD::CMPM_RND:
5250 case ISD::SETCC:
5251 return true;
5252 }
5253 }
5255 /// Insert i1-subvector to i1-vector.
5256 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5257 const X86Subtarget &Subtarget) {
5260 SDValue Vec = Op.getOperand(0);
5261 SDValue SubVec = Op.getOperand(1);
5262 SDValue Idx = Op.getOperand(2);
5264 if (!isa<ConstantSDNode>(Idx))
5265 return SDValue();
5267 // Inserting undef is a nop. We can just return the original vector.
5268 if (SubVec.isUndef())
5269 return Vec;
5271 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5272 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5273 return Op;
5275 MVT OpVT = Op.getSimpleValueType();
5276 unsigned NumElems = OpVT.getVectorNumElements();
5278 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5280 // Extend to natively supported kshift.
5281 MVT WideOpVT = OpVT;
5282 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5283 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5285 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5286 // if necessary.
5287 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5288 // May need to promote to a legal type.
5289 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5290 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5291 SubVec, ZeroIdx);
5292 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5295 MVT SubVecVT = SubVec.getSimpleValueType();
5296 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5298 assert(IdxVal + SubVecNumElems <= NumElems &&
5299 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5300 "Unexpected index value in INSERT_SUBVECTOR");
5302 SDValue Undef = DAG.getUNDEF(WideOpVT);
5304 if (IdxVal == 0) {
5305 // Zero lower bits of the Vec
5306 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5307 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5308 ZeroIdx);
5309 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5310 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5311 // Merge them together, SubVec should be zero extended.
5312 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5313 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5314 SubVec, ZeroIdx);
5315 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5316 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5317 }
5319 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5320 Undef, SubVec, ZeroIdx);
5322 if (Vec.isUndef()) {
5323 assert(IdxVal != 0 && "Unexpected index");
5324 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5325 DAG.getConstant(IdxVal, dl, MVT::i8));
5326 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5329 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5330 assert(IdxVal != 0 && "Unexpected index");
5331 NumElems = WideOpVT.getVectorNumElements();
5332 unsigned ShiftLeft = NumElems - SubVecNumElems;
5333 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5334 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5335 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5336 if (ShiftRight != 0)
5337 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5338 DAG.getConstant(ShiftRight, dl, MVT::i8));
5339 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5342 // Simple case when we put subvector in the upper part
5343 if (IdxVal + SubVecNumElems == NumElems) {
5344 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5345 DAG.getConstant(IdxVal, dl, MVT::i8));
5346 if (SubVecNumElems * 2 == NumElems) {
5347 // Special case, use legal zero extending insert_subvector. This allows
5348 // isel to optimize when bits are known zero.
5349 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5350 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5351 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5352 Vec, ZeroIdx);
5353 } else {
5354 // Otherwise use explicit shifts to zero the bits.
5355 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5356 Undef, Vec, ZeroIdx);
5357 NumElems = WideOpVT.getVectorNumElements();
5358 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5359 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5360 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5361 }
5362 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5363 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5366 // Inserting into the middle is more complicated.
5368 NumElems = WideOpVT.getVectorNumElements();
5370 // Widen the vector if needed.
5371 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5372 // Move the current value of the bits to be replaced to the lsbs.
5373 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5374 DAG.getConstant(IdxVal, dl, MVT::i8));
5375 // Xor with the new bit.
5376 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5377 // Shift to MSB, filling bottom bits with 0.
5378 unsigned ShiftLeft = NumElems - SubVecNumElems;
5379 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5380 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5381 // Shift to the final position, filling upper bits with 0.
5382 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5383 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5384 DAG.getConstant(ShiftRight, dl, MVT::i8));
5385 // Xor with original vector leaving the new value.
5386 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5387 // Reduce to original width if needed.
5388 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5391 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5392 unsigned NumElems, SelectionDAG &DAG,
5393 const SDLoc &dl, unsigned VectorWidth) {
5394 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5395 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5398 /// Returns a vector of specified type with all bits set.
5399 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5400 /// Then bitcast to their original type, ensuring they get CSE'd.
5401 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5402 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5403 "Expected a 128/256/512-bit vector type");
5405 APInt Ones = APInt::getAllOnesValue(32);
5406 unsigned NumElts = VT.getSizeInBits() / 32;
5407 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5408 return DAG.getBitcast(VT, Vec);
5411 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5412 SelectionDAG &DAG) {
5413 EVT InVT = In.getValueType();
5414 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5416 if (VT.is128BitVector() && InVT.is128BitVector())
5417 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5418 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5420 // For 256-bit vectors, we only need the lower (128-bit) input half.
5421 // For 512-bit vectors, we only need the lower input half or quarter.
5422 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5423 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5424 In = extractSubVector(In, 0, DAG, DL,
5425 std::max(128, (int)VT.getSizeInBits() / Scale));
5426 }
5428 return DAG.getNode(Opc, DL, VT, In);
5431 /// Returns a vector_shuffle node for an unpackl operation.
5432 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5433 SDValue V1, SDValue V2) {
5434 SmallVector<int, 8> Mask;
5435 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5436 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5439 /// Returns a vector_shuffle node for an unpackh operation.
5440 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5441 SDValue V1, SDValue V2) {
5442 SmallVector<int, 8> Mask;
5443 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5444 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5445 }
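// Illustrative example (sketch): for VT = v4i32, getUnpackl(DAG, dl, VT, V1,
// V2) uses the shuffle mask <0, 4, 1, 5> (interleaving the low halves of V1
// and V2) and getUnpackh uses <2, 6, 3, 7> for the high halves.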
5447 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5448 /// This produces a shuffle where the low element of V2 is swizzled into the
5449 /// zero/undef vector, landing at element Idx.
5450 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5451 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5452 bool IsZero,
5453 const X86Subtarget &Subtarget,
5454 SelectionDAG &DAG) {
5455 MVT VT = V2.getSimpleValueType();
5456 SDValue V1 = IsZero
5457 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5458 int NumElems = VT.getVectorNumElements();
5459 SmallVector<int, 16> MaskVec(NumElems);
5460 for (int i = 0; i != NumElems; ++i)
5461 // If this is the insertion idx, put the low elt of V2 here.
5462 MaskVec[i] = (i == Idx) ? NumElems : i;
5463 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5466 static SDValue peekThroughBitcasts(SDValue V) {
5467 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5468 V = V.getOperand(0);
5469 return V;
5470 }
5472 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5473 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5474 V.getOperand(0).hasOneUse())
5475 V = V.getOperand(0);
5476 return V;
5477 }
5479 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5480 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5481 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5482 V = V.getOperand(0);
5483 return V;
5484 }
5486 static const Constant *getTargetConstantFromNode(SDValue Op) {
5487 Op = peekThroughBitcasts(Op);
5489 auto *Load = dyn_cast<LoadSDNode>(Op);
5490 if (!Load)
5491 return nullptr;
5493 SDValue Ptr = Load->getBasePtr();
5494 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5495 Ptr->getOpcode() == X86ISD::WrapperRIP)
5496 Ptr = Ptr->getOperand(0);
5498 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5499 if (!CNode || CNode->isMachineConstantPoolEntry())
5500 return nullptr;
5502 return dyn_cast<Constant>(CNode->getConstVal());
5503 }
5505 // Extract raw constant bits from constant pools.
5506 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5507 APInt &UndefElts,
5508 SmallVectorImpl<APInt> &EltBits,
5509 bool AllowWholeUndefs = true,
5510 bool AllowPartialUndefs = true) {
5511 assert(EltBits.empty() && "Expected an empty EltBits vector");
5513 Op = peekThroughBitcasts(Op);
5515 EVT VT = Op.getValueType();
5516 unsigned SizeInBits = VT.getSizeInBits();
5517 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5518 unsigned NumElts = SizeInBits / EltSizeInBits;
5520 // Bitcast a source array of element bits to the target size.
5521 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5522 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5523 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5524 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5525 "Constant bit sizes don't match");
5527 // Don't split if we don't allow undef bits.
5528 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5529 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5532 // If we're already the right size, don't bother bitcasting.
5533 if (NumSrcElts == NumElts) {
5534 UndefElts = UndefSrcElts;
5535 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5536 return true;
5537 }
5539 // Extract all the undef/constant element data and pack into single bitsets.
5540 APInt UndefBits(SizeInBits, 0);
5541 APInt MaskBits(SizeInBits, 0);
5543 for (unsigned i = 0; i != NumSrcElts; ++i) {
5544 unsigned BitOffset = i * SrcEltSizeInBits;
5545 if (UndefSrcElts[i])
5546 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5547 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5550 // Split the undef/constant single bitset data into the target elements.
5551 UndefElts = APInt(NumElts, 0);
5552 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5554 for (unsigned i = 0; i != NumElts; ++i) {
5555 unsigned BitOffset = i * EltSizeInBits;
5556 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5558 // Only treat an element as UNDEF if all bits are UNDEF.
5559 if (UndefEltBits.isAllOnesValue()) {
5560 if (!AllowWholeUndefs)
5561 return false;
5562 UndefElts.setBit(i);
5563 continue;
5564 }
5566 // If only some bits are UNDEF then treat them as zero (or bail if not
5567 // supported).
5568 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5569 return false;
5571 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5572 EltBits[i] = Bits.getZExtValue();
5573 }
5574 return true;
5575 };
5577 // Collect constant bits and insert into mask/undef bit masks.
5578 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5579 unsigned UndefBitIndex) {
5582 if (isa<UndefValue>(Cst)) {
5583 Undefs.setBit(UndefBitIndex);
5584 return true;
5585 }
5586 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5587 Mask = CInt->getValue();
5588 return true;
5589 }
5590 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5591 Mask = CFP->getValueAPF().bitcastToAPInt();
5592 return true;
5593 }
5594 return false;
5595 };
5597 // Handle UNDEFs.
5598 if (Op.isUndef()) {
5599 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5600 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5601 return CastBitData(UndefSrcElts, SrcEltBits);
5602 }
5604 // Extract scalar constant bits.
5605 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5606 APInt UndefSrcElts = APInt::getNullValue(1);
5607 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5608 return CastBitData(UndefSrcElts, SrcEltBits);
5610 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5611 APInt UndefSrcElts = APInt::getNullValue(1);
5612 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5613 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5614 return CastBitData(UndefSrcElts, SrcEltBits);
5617 // Extract constant bits from build vector.
5618 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5619 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5620 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5622 APInt UndefSrcElts(NumSrcElts, 0);
5623 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5624 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5625 const SDValue &Src = Op.getOperand(i);
5626 if (Src.isUndef()) {
5627 UndefSrcElts.setBit(i);
5630 auto *Cst = cast<ConstantSDNode>(Src);
5631 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5633 return CastBitData(UndefSrcElts, SrcEltBits);
5636 // Extract constant bits from constant pool vector.
5637 if (auto *Cst = getTargetConstantFromNode(Op)) {
5638 Type *CstTy = Cst->getType();
5639 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5642 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5643 unsigned NumSrcElts = CstTy->getVectorNumElements();
5645 APInt UndefSrcElts(NumSrcElts, 0);
5646 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5647 for (unsigned i = 0; i != NumSrcElts; ++i)
5648 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5649 UndefSrcElts, i))
5650 return false;
5652 return CastBitData(UndefSrcElts, SrcEltBits);
5653 }
5655 // Extract constant bits from a broadcasted constant pool scalar.
5656 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5657 EltSizeInBits <= VT.getScalarSizeInBits()) {
5658 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5659 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5660 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5662 APInt UndefSrcElts(NumSrcElts, 0);
5663 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5664 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5665 if (UndefSrcElts[0])
5666 UndefSrcElts.setBits(0, NumSrcElts);
5667 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5668 return CastBitData(UndefSrcElts, SrcEltBits);
5673 // Extract a rematerialized scalar constant insertion.
5674 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5675 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5676 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5677 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5678 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5680 APInt UndefSrcElts(NumSrcElts, 0);
5681 SmallVector<APInt, 64> SrcEltBits;
5682 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5683 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5684 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5685 return CastBitData(UndefSrcElts, SrcEltBits);
5686 }
5688 return false;
5689 }
5691 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5692 unsigned MaskEltSizeInBits,
5693 SmallVectorImpl<uint64_t> &RawMask) {
5694 APInt UndefElts;
5695 SmallVector<APInt, 64> EltBits;
5697 // Extract the raw target constant bits.
5698 // FIXME: We currently don't support UNDEF bits or mask entries.
5699 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5700 EltBits, /* AllowWholeUndefs */ false,
5701 /* AllowPartialUndefs */ false))
5702 return false;
5704 // Insert the extracted elements into the mask.
5705 for (APInt Elt : EltBits)
5706 RawMask.push_back(Elt.getZExtValue());
5708 return true;
5709 }
5711 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5712 /// Note: This ignores saturation, so inputs must be checked first.
5713 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5714 bool Unary) {
5715 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5716 unsigned NumElts = VT.getVectorNumElements();
5717 unsigned NumLanes = VT.getSizeInBits() / 128;
5718 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5719 unsigned Offset = Unary ? 0 : NumElts;
5721 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5722 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5723 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5724 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5725 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5726 }
5727 }
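// Illustrative example (sketch): for a binary pack with VT = v16i8 (one
// 128-bit lane), the mask is <0,2,4,...,14, 16,18,...,30>: the even elements
// of the first operand followed by the even elements of the second,
// mirroring where PACKSS/PACKUS place the truncated lanes.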
5729 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5730 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5731 /// operands in \p Ops, and returns true.
5732 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5733 /// IsUnary for shuffles which use a single input multiple times, and in those
5734 /// cases it will adjust the mask to only have indices within that single input.
5735 /// It is an error to call this with non-empty Mask/Ops vectors.
5736 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5737 SmallVectorImpl<SDValue> &Ops,
5738 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5739 unsigned NumElems = VT.getVectorNumElements();
5740 SDValue ImmN;
5742 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5743 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5745 IsUnary = false;
5746 bool IsFakeUnary = false;
5747 switch(N->getOpcode()) {
5748 case X86ISD::BLENDI:
5749 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5750 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5751 ImmN = N->getOperand(N->getNumOperands()-1);
5752 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5753 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5754 break;
5755 case X86ISD::SHUFP:
5756 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5757 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5758 ImmN = N->getOperand(N->getNumOperands()-1);
5759 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5760 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5761 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5762 break;
5763 case X86ISD::INSERTPS:
5764 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5765 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5766 ImmN = N->getOperand(N->getNumOperands()-1);
5767 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5768 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5769 break;
5770 case X86ISD::EXTRQI:
5771 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5772 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5773 isa<ConstantSDNode>(N->getOperand(2))) {
5774 int BitLen = N->getConstantOperandVal(1);
5775 int BitIdx = N->getConstantOperandVal(2);
5776 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5777 Mask);
5778 IsUnary = true;
5779 }
5780 break;
5781 case X86ISD::INSERTQI:
5782 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5783 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5784 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5785 isa<ConstantSDNode>(N->getOperand(3))) {
5786 int BitLen = N->getConstantOperandVal(2);
5787 int BitIdx = N->getConstantOperandVal(3);
5788 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5789 Mask);
5790 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5791 }
5792 break;
5793 case X86ISD::UNPCKH:
5794 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5795 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5796 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5797 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5798 break;
5799 case X86ISD::UNPCKL:
5800 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5801 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5802 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5803 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5804 break;
5805 case X86ISD::MOVHLPS:
5806 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5807 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5808 DecodeMOVHLPSMask(NumElems, Mask);
5809 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5810 break;
5811 case X86ISD::MOVLHPS:
5812 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5813 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5814 DecodeMOVLHPSMask(NumElems, Mask);
5815 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5816 break;
5817 case X86ISD::PALIGNR:
5818 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5819 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5820 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5821 ImmN = N->getOperand(N->getNumOperands()-1);
5822 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5823 Mask);
5824 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5825 Ops.push_back(N->getOperand(1));
5826 Ops.push_back(N->getOperand(0));
5827 break;
5828 case X86ISD::VSHLDQ:
5829 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5830 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5831 ImmN = N->getOperand(N->getNumOperands() - 1);
5832 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5833 Mask);
5834 IsUnary = true;
5835 break;
5836 case X86ISD::VSRLDQ:
5837 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5839 ImmN = N->getOperand(N->getNumOperands() - 1);
5840 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5841 Mask);
5842 IsUnary = true;
5843 break;
5844 case X86ISD::PSHUFD:
5845 case X86ISD::VPERMILPI:
5846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5847 ImmN = N->getOperand(N->getNumOperands()-1);
5848 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5849 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5850 IsUnary = true;
5851 break;
5852 case X86ISD::PSHUFHW:
5853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5854 ImmN = N->getOperand(N->getNumOperands()-1);
5855 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5856 Mask);
5857 IsUnary = true;
5858 break;
5859 case X86ISD::PSHUFLW:
5860 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5861 ImmN = N->getOperand(N->getNumOperands()-1);
5862 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5863 Mask);
5864 IsUnary = true;
5865 break;
5866 case X86ISD::VZEXT_MOVL:
5867 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5868 DecodeZeroMoveLowMask(NumElems, Mask);
5869 IsUnary = true;
5870 break;
5871 case X86ISD::VBROADCAST: {
5872 SDValue N0 = N->getOperand(0);
5873 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5874 // add the pre-extracted value to the Ops vector.
5875 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5876 N0.getOperand(0).getValueType() == VT &&
5877 N0.getConstantOperandVal(1) == 0)
5878 Ops.push_back(N0.getOperand(0));
5880 // We only decode broadcasts of same-sized vectors, unless the broadcast
5881 // came from an extract from the original width. If we found one, we
5882 // pushed it the Ops vector above.
5883 if (N0.getValueType() == VT || !Ops.empty()) {
5884 DecodeVectorBroadcast(NumElems, Mask);
5885 IsUnary = true;
5886 break;
5887 }
5888 return false;
5889 }
5890 case X86ISD::VPERMILPV: {
5891 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5892 IsUnary = true;
5893 SDValue MaskNode = N->getOperand(1);
5894 unsigned MaskEltSize = VT.getScalarSizeInBits();
5895 SmallVector<uint64_t, 32> RawMask;
5896 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5897 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5898 break;
5899 }
5900 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5901 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5902 break;
5903 }
5904 return false;
5905 }
5906 case X86ISD::PSHUFB: {
5907 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5908 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5909 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5910 IsUnary = true;
5911 SDValue MaskNode = N->getOperand(1);
5912 SmallVector<uint64_t, 32> RawMask;
5913 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5914 DecodePSHUFBMask(RawMask, Mask);
5915 break;
5916 }
5917 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5918 DecodePSHUFBMask(C, Mask);
5919 break;
5920 }
5921 return false;
5922 }
5923 case X86ISD::VPERMI:
5924 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5925 ImmN = N->getOperand(N->getNumOperands()-1);
5926 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5927 IsUnary = true;
5928 break;
5929 case X86ISD::MOVSS:
5930 case X86ISD::MOVSD:
5931 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5932 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5933 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5934 break;
5935 case X86ISD::VPERM2X128:
5936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5937 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5938 ImmN = N->getOperand(N->getNumOperands()-1);
5939 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5940 Mask);
5941 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5942 break;
5943 case X86ISD::SHUF128:
5944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5946 ImmN = N->getOperand(N->getNumOperands()-1);
5947 decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
5948 cast<ConstantSDNode>(ImmN)->getZExtValue(),
5949 Mask);
5950 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5951 break;
5952 case X86ISD::MOVSLDUP:
5953 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5954 DecodeMOVSLDUPMask(NumElems, Mask);
5955 IsUnary = true;
5956 break;
5957 case X86ISD::MOVSHDUP:
5958 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5959 DecodeMOVSHDUPMask(NumElems, Mask);
5960 IsUnary = true;
5961 break;
5962 case X86ISD::MOVDDUP:
5963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5964 DecodeMOVDDUPMask(NumElems, Mask);
5965 IsUnary = true;
5966 break;
5967 case X86ISD::VPERMIL2: {
5968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5969 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5970 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5971 unsigned MaskEltSize = VT.getScalarSizeInBits();
5972 SDValue MaskNode = N->getOperand(2);
5973 SDValue CtrlNode = N->getOperand(3);
5974 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5975 unsigned CtrlImm = CtrlOp->getZExtValue();
5976 SmallVector<uint64_t, 32> RawMask;
5977 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5978 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5979 RawMask, Mask);
5980 break;
5981 }
5982 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5983 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5984 break;
5985 }
5986 }
5987 return false;
5988 }
5989 case X86ISD::VPPERM: {
5990 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5991 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5992 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5993 SDValue MaskNode = N->getOperand(2);
5994 SmallVector<uint64_t, 32> RawMask;
5995 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5996 DecodeVPPERMMask(RawMask, Mask);
5997 break;
5998 }
5999 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6000 DecodeVPPERMMask(C, Mask);
6001 break;
6002 }
6003 return false;
6004 }
6005 case X86ISD::VPERMV: {
6006 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6007 IsUnary = true;
6008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6009 Ops.push_back(N->getOperand(1));
6010 SDValue MaskNode = N->getOperand(0);
6011 SmallVector<uint64_t, 32> RawMask;
6012 unsigned MaskEltSize = VT.getScalarSizeInBits();
6013 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
6014 DecodeVPERMVMask(RawMask, Mask);
6015 break;
6016 }
6017 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6018 DecodeVPERMVMask(C, MaskEltSize, Mask);
6023 case X86ISD::VPERMV3: {
6024 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6025 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6026 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6027 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6028 Ops.push_back(N->getOperand(0));
6029 Ops.push_back(N->getOperand(2));
6030 SDValue MaskNode = N->getOperand(1);
6031 unsigned MaskEltSize = VT.getScalarSizeInBits();
6032 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6033 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
6038 default: llvm_unreachable("unknown target shuffle node");
6041 // Empty mask indicates the decode failed.
6046 // Check if we're getting a shuffle mask with zeroed elements.
6046 if (!AllowSentinelZero)
6047 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6050 // If we have a fake unary shuffle, the shuffle mask is spread across two
6051 // inputs that are actually the same node. Re-map the mask to always point
6052 // into the first input.
6055 if (M >= (int)Mask.size())
6058 // If we didn't already add operands in the opcode-specific code, default to
6059 // adding 1 or 2 operands starting at 0.
6061 Ops.push_back(N->getOperand(0));
6062 if (!IsUnary || IsFakeUnary)
6063 Ops.push_back(N->getOperand(1));
6069 /// Check a target shuffle mask's inputs to see if we can set any values to
6070 /// SM_SentinelZero - this is for elements that are known to be zero
6071 /// (not just zeroable) from their inputs.
6072 /// Returns true if the target shuffle mask was decoded.
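/// For example (illustrative only): a unary shuffle with mask <0,1,2,3> whose
/// input is the constant (build_vector 7, 0, undef, 7) can be refined to the
/// mask <0, SM_SentinelZero, SM_SentinelUndef, 3>.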
6073 static bool setTargetShuffleZeroElements(SDValue N,
6074 SmallVectorImpl<int> &Mask,
6075 SmallVectorImpl<SDValue> &Ops) {
6077 if (!isTargetShuffle(N.getOpcode()))
6080 MVT VT = N.getSimpleValueType();
6081 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6084 SDValue V1 = Ops[0];
6085 SDValue V2 = IsUnary ? V1 : Ops[1];
6087 V1 = peekThroughBitcasts(V1);
6088 V2 = peekThroughBitcasts(V2);
6090 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6091 "Illegal split of shuffle value type");
6092 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6094 // Extract known constant input data.
6095 APInt UndefSrcElts[2];
6096 SmallVector<APInt, 32> SrcEltBits[2];
6097 bool IsSrcConstant[2] = {
6098 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6099 SrcEltBits[0], true, false),
6100 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6101 SrcEltBits[1], true, false)};
6103 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6106 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6110 // Determine shuffle input and normalize the mask.
6111 unsigned SrcIdx = M / Size;
6112 SDValue V = M < Size ? V1 : V2;
6115 // We are referencing an UNDEF input.
6117 Mask[i] = SM_SentinelUndef;
6121 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6122 // TODO: We currently only set UNDEF for integer types - floats use the same
6123 // registers as vectors and many of the scalar folded loads rely on the
6124 // SCALAR_TO_VECTOR pattern.
6125 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6126 (Size % V.getValueType().getVectorNumElements()) == 0) {
6127 int Scale = Size / V.getValueType().getVectorNumElements();
6128 int Idx = M / Scale;
6129 if (Idx != 0 && !VT.isFloatingPoint())
6130 Mask[i] = SM_SentinelUndef;
6131 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6132 Mask[i] = SM_SentinelZero;
6136 // Attempt to extract from the source's constant bits.
6137 if (IsSrcConstant[SrcIdx]) {
6138 if (UndefSrcElts[SrcIdx][M])
6139 Mask[i] = SM_SentinelUndef;
6140 else if (SrcEltBits[SrcIdx][M] == 0)
6141 Mask[i] = SM_SentinelZero;
6145 assert(VT.getVectorNumElements() == Mask.size() &&
6146 "Different mask size from vector size!");
6151 // Attempt to decode ops that could be represented as a shuffle mask.
6152 // The decoded shuffle mask may contain a different number of elements from
6153 // the destination value type.
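// For example (illustrative only): an ISD::AND with the constant per-byte
// mask <0xFF,0x00,0xFF,0xFF> decodes to the shuffle mask
// <0, SM_SentinelZero, 2, 3> over the non-constant operand.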
6153 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6154 SmallVectorImpl<SDValue> &Ops,
6155 const SelectionDAG &DAG) {
6159 MVT VT = N.getSimpleValueType();
6160 unsigned NumElts = VT.getVectorNumElements();
6161 unsigned NumSizeInBits = VT.getSizeInBits();
6162 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6163 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6164 "Expected byte aligned value types");
6166 unsigned Opcode = N.getOpcode();
6168 case ISD::VECTOR_SHUFFLE: {
6169 // ISD::VECTOR_SHUFFLE isn't treated as a target shuffle, so decode it here.
6170 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6171 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6172 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6173 Ops.push_back(N.getOperand(0));
6174 Ops.push_back(N.getOperand(1));
6180 case X86ISD::ANDNP: {
6181 // Attempt to decode as a per-byte mask.
6183 SmallVector<APInt, 32> EltBits;
6184 SDValue N0 = N.getOperand(0);
6185 SDValue N1 = N.getOperand(1);
6186 bool IsAndN = (X86ISD::ANDNP == Opcode);
6187 uint64_t ZeroMask = IsAndN ? 255 : 0;
6188 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6190 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6192 Mask.push_back(SM_SentinelUndef);
6195 uint64_t ByteBits = EltBits[i].getZExtValue();
6196 if (ByteBits != 0 && ByteBits != 255)
6198 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6200 Ops.push_back(IsAndN ? N1 : N0);
6203 case ISD::SCALAR_TO_VECTOR: {
6204 // Match against a scalar_to_vector of an extract from a vector;
6205 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6206 SDValue N0 = N.getOperand(0);
6209 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6210 N0.getOperand(0).getValueType() == VT) ||
6211 (N0.getOpcode() == X86ISD::PEXTRW &&
6212 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6213 (N0.getOpcode() == X86ISD::PEXTRB &&
6214 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6218 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6221 SDValue SrcVec = SrcExtract.getOperand(0);
6222 EVT SrcVT = SrcVec.getValueType();
6223 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6224 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6226 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6227 if (NumSrcElts <= SrcIdx)
6230 Ops.push_back(SrcVec);
6231 Mask.push_back(SrcIdx);
6232 Mask.append(NumZeros, SM_SentinelZero);
6233 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6236 case X86ISD::PINSRB:
6237 case X86ISD::PINSRW: {
6238 SDValue InVec = N.getOperand(0);
6239 SDValue InScl = N.getOperand(1);
6240 SDValue InIndex = N.getOperand(2);
6241 if (!isa<ConstantSDNode>(InIndex) ||
6242 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6244 uint64_t InIdx = N.getConstantOperandVal(2);
6246 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6247 if (X86::isZeroNode(InScl)) {
6248 Ops.push_back(InVec);
6249 for (unsigned i = 0; i != NumElts; ++i)
6250 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6254 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6255 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6257 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6258 if (InScl.getOpcode() != ExOp)
6261 SDValue ExVec = InScl.getOperand(0);
6262 SDValue ExIndex = InScl.getOperand(1);
6263 if (!isa<ConstantSDNode>(ExIndex) ||
6264 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6266 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6268 Ops.push_back(InVec);
6269 Ops.push_back(ExVec);
6270 for (unsigned i = 0; i != NumElts; ++i)
6271 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6274 case X86ISD::PACKSS:
6275 case X86ISD::PACKUS: {
6276 SDValue N0 = N.getOperand(0);
6277 SDValue N1 = N.getOperand(1);
6278 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6279 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6280 "Unexpected input value type");
6282 // If we know input saturation won't happen, we can treat this
6283 // as a truncation shuffle.
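// For example (illustrative only): a v16i8 PACKUS(v8i16 A, v8i16 B) whose
// inputs have their upper 8 bits known zero acts as the truncating byte
// shuffle <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30> of concat(A,B).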
6284 if (Opcode == X86ISD::PACKSS) {
6285 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6286 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6289 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6290 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6291 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6295 bool IsUnary = (N0 == N1);
6301 createPackShuffleMask(VT, Mask, IsUnary);
6305 case X86ISD::VSRLI: {
6306 uint64_t ShiftVal = N.getConstantOperandVal(1);
6307 // Out of range bit shifts are guaranteed to be zero.
6308 if (NumBitsPerElt <= ShiftVal) {
6309 Mask.append(NumElts, SM_SentinelZero);
6313 // We can only decode 'whole byte' bit shifts as shuffles.
6314 if ((ShiftVal % 8) != 0)
6317 uint64_t ByteShift = ShiftVal / 8;
6318 unsigned NumBytes = NumSizeInBits / 8;
6319 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6320 Ops.push_back(N.getOperand(0));
6322 // Clear mask to all zeros and insert the shifted byte indices.
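// For example (illustrative only): a v2i64 VSRLI by 16 has ByteShift = 2 and
// produces the byte shuffle mask <2,3,4,5,6,7,Z,Z,10,11,12,13,14,15,Z,Z>,
// where Z denotes SM_SentinelZero.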
6323 Mask.append(NumBytes, SM_SentinelZero);
6325 if (X86ISD::VSHLI == Opcode) {
6326 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6327 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6328 Mask[i + j] = i + j - ByteShift;
6330 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6331 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6332 Mask[i + j - ByteShift] = i + j;
6336 case ISD::ZERO_EXTEND_VECTOR_INREG:
6337 case X86ISD::VZEXT: {
6338 // TODO - add support for VPMOVZX with smaller input vector types.
6339 SDValue Src = N.getOperand(0);
6340 MVT SrcVT = Src.getSimpleValueType();
6341 if (NumSizeInBits != SrcVT.getSizeInBits())
6343 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6344 VT.getVectorNumElements(), Mask);
6353 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
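/// For example (illustrative only): given inputs [A, B] and a mask <4,5,6,7>
/// that only references B, A is dropped and the mask is rebased to <0,1,2,3>
/// over the single remaining input [B].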
6354 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6355 SmallVectorImpl<int> &Mask) {
6356 int MaskWidth = Mask.size();
6357 SmallVector<SDValue, 16> UsedInputs;
6358 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6359 int lo = UsedInputs.size() * MaskWidth;
6360 int hi = lo + MaskWidth;
6362 // Strip UNDEF input usage.
6363 if (Inputs[i].isUndef())
6365 if ((lo <= M) && (M < hi))
6366 M = SM_SentinelUndef;
6368 // Check for unused inputs.
6369 if (any_of(Mask, [lo, hi](int M) { return (lo <= M) && (M < hi); })) {
6370 UsedInputs.push_back(Inputs[i]);
6377 Inputs = UsedInputs;
6380 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6381 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6382 /// remaining input indices in case we now have a unary shuffle and adjust the
6383 /// inputs accordingly.
6384 /// Returns true if the target shuffle mask was decoded.
6385 static bool resolveTargetShuffleInputs(SDValue Op,
6386 SmallVectorImpl<SDValue> &Inputs,
6387 SmallVectorImpl<int> &Mask,
6388 const SelectionDAG &DAG) {
6389 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6390 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6393 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6397 /// Returns the scalar element that will make up the ith
6398 /// element of the result of the vector shuffle.
6399 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6402 return SDValue(); // Limit search depth.
6404 SDValue V = SDValue(N, 0);
6405 EVT VT = V.getValueType();
6406 unsigned Opcode = V.getOpcode();
6408 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6409 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6410 int Elt = SV->getMaskElt(Index);
6413 return DAG.getUNDEF(VT.getVectorElementType());
6415 unsigned NumElems = VT.getVectorNumElements();
6416 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6417 : SV->getOperand(1);
6418 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6421 // Recurse into target specific vector shuffles to find scalars.
6422 if (isTargetShuffle(Opcode)) {
6423 MVT ShufVT = V.getSimpleValueType();
6424 MVT ShufSVT = ShufVT.getVectorElementType();
6425 int NumElems = (int)ShufVT.getVectorNumElements();
6426 SmallVector<int, 16> ShuffleMask;
6427 SmallVector<SDValue, 16> ShuffleOps;
6430 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6433 int Elt = ShuffleMask[Index];
6434 if (Elt == SM_SentinelZero)
6435 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6436 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6437 if (Elt == SM_SentinelUndef)
6438 return DAG.getUNDEF(ShufSVT);
6440 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6441 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6442 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6446 // Actual nodes that may contain scalar elements
6447 if (Opcode == ISD::BITCAST) {
6448 V = V.getOperand(0);
6449 EVT SrcVT = V.getValueType();
6450 unsigned NumElems = VT.getVectorNumElements();
6452 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6456 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6457 return (Index == 0) ? V.getOperand(0)
6458 : DAG.getUNDEF(VT.getVectorElementType());
6460 if (V.getOpcode() == ISD::BUILD_VECTOR)
6461 return V.getOperand(Index);
6466 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6467 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6468 unsigned NumNonZero, unsigned NumZero,
6470 const X86Subtarget &Subtarget) {
6471 MVT VT = Op.getSimpleValueType();
6472 unsigned NumElts = VT.getVectorNumElements();
6473 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6474 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6475 "Illegal vector insertion");
6481 for (unsigned i = 0; i < NumElts; ++i) {
6482 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6486 // If the build vector contains zeros, or our first insertion is not at
6487 // index 0, insert into a zero vector to break any register dependency;
6488 // otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6491 if (NumZero || 0 != i)
6492 V = getZeroVector(VT, Subtarget, DAG, dl);
6494 assert(0 == i && "Expected insertion into zero-index");
6495 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6496 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6497 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6498 V = DAG.getBitcast(VT, V);
6502 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6503 DAG.getIntPtrConstant(i, dl));
6509 /// Custom lower build_vector of v16i8.
6510 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6511 unsigned NumNonZero, unsigned NumZero,
6513 const X86Subtarget &Subtarget) {
6514 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6517 // SSE4.1 - use PINSRB to insert each byte directly.
6518 if (Subtarget.hasSSE41())
6519 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6526 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6527 for (unsigned i = 0; i < 16; ++i) {
6528 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6529 if (ThisIsNonZero && First) {
6531 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6533 V = DAG.getUNDEF(MVT::v8i16);
6538 // FIXME: Investigate extending to i32 instead of just i16.
6539 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6540 SDValue ThisElt, LastElt;
6541 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6542 if (LastIsNonZero) {
6544 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6546 if (ThisIsNonZero) {
6547 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6548 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6549 DAG.getConstant(8, dl, MVT::i8));
6551 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6557 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6558 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6559 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6560 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6561 V = DAG.getBitcast(MVT::v8i16, V);
6563 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6564 DAG.getIntPtrConstant(i / 2, dl));
6570 return DAG.getBitcast(MVT::v16i8, V);
6573 /// Custom lower build_vector of v8i16.
6574 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6575 unsigned NumNonZero, unsigned NumZero,
6577 const X86Subtarget &Subtarget) {
6578 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6581 // Use PINSRW to insert each element directly.
6582 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6586 /// Custom lower build_vector of v4i32 or v4f32.
6587 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6588 const X86Subtarget &Subtarget) {
6589 // Find all zeroable elements.
6590 std::bitset<4> Zeroable;
6591 for (int i=0; i < 4; ++i) {
6592 SDValue Elt = Op->getOperand(i);
6593 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6595 assert(Zeroable.size() - Zeroable.count() > 1 &&
6596 "We expect at least two non-zero elements!");
6598 // We only know how to deal with build_vector nodes where elements are either
6599 // zeroable or extract_vector_elt with a constant index.
6600 SDValue FirstNonZero;
6601 unsigned FirstNonZeroIdx;
6602 for (unsigned i=0; i < 4; ++i) {
6605 SDValue Elt = Op->getOperand(i);
6606 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6607 !isa<ConstantSDNode>(Elt.getOperand(1)))
6609 // Make sure that this node is extracting from a 128-bit vector.
6610 MVT VT = Elt.getOperand(0).getSimpleValueType();
6611 if (!VT.is128BitVector())
6613 if (!FirstNonZero.getNode()) {
6615 FirstNonZeroIdx = i;
6619 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6620 SDValue V1 = FirstNonZero.getOperand(0);
6621 MVT VT = V1.getSimpleValueType();
6623 // See if this build_vector can be lowered as a blend with zero.
6625 unsigned EltMaskIdx, EltIdx;
6627 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6628 if (Zeroable[EltIdx]) {
6629 // The zero vector will be on the right hand side.
6630 Mask[EltIdx] = EltIdx+4;
6634 Elt = Op->getOperand(EltIdx);
6635 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6636 EltMaskIdx = Elt.getConstantOperandVal(1);
6637 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6639 Mask[EltIdx] = EltIdx;
6643 // Let the shuffle legalizer deal with blend operations.
6644 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6645 if (V1.getSimpleValueType() != VT)
6646 V1 = DAG.getBitcast(VT, V1);
6647 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6650 // See if we can lower this build_vector to an INSERTPS.
6651 if (!Subtarget.hasSSE41())
6654 SDValue V2 = Elt.getOperand(0);
6655 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6658 bool CanFold = true;
6659 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6663 SDValue Current = Op->getOperand(i);
6664 SDValue SrcVector = Current->getOperand(0);
6667 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6673 assert(V1.getNode() && "Expected at least two non-zero elements!");
6674 if (V1.getSimpleValueType() != MVT::v4f32)
6675 V1 = DAG.getBitcast(MVT::v4f32, V1);
6676 if (V2.getSimpleValueType() != MVT::v4f32)
6677 V2 = DAG.getBitcast(MVT::v4f32, V2);
6679 // Ok, we can emit an INSERTPS instruction.
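// The 8-bit INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].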
6680 unsigned ZMask = Zeroable.to_ulong();
6682 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6683 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6685 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6686 DAG.getIntPtrConstant(InsertPSMask, DL));
6687 return DAG.getBitcast(VT, Result);
6690 /// Return a vector logical shift node.
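/// For example (illustrative only): getVShift(true, MVT::v4i32, Src, 32, ...)
/// produces (v4i32 (bitcast (VSHLDQ (v16i8 (bitcast Src)), 4))), i.e. a whole
/// vector left shift by 4 bytes.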
6691 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6692 SelectionDAG &DAG, const TargetLowering &TLI,
6694 assert(VT.is128BitVector() && "Unknown type for VShift");
6695 MVT ShVT = MVT::v16i8;
6696 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6697 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6698 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6699 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
6700 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6703 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6704 SelectionDAG &DAG) {
6706 // Check if the scalar load can be widened into a vector load. And if
6707 // the address is "base + cst", see if the cst can be "absorbed" into
6708 // the shuffle mask.
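// For example (illustrative only): a splat of (load f32 (FrameIndex + 4)),
// where the stack slot can be made 16-byte aligned, may be widened to
// (load v4f32 FrameIndex) followed by a splat with shuffle mask <1,1,1,1>.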
6709 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6710 SDValue Ptr = LD->getBasePtr();
6711 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6713 EVT PVT = LD->getValueType(0);
6714 if (PVT != MVT::i32 && PVT != MVT::f32)
6719 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6720 FI = FINode->getIndex();
6722 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6723 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6724 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6725 Offset = Ptr.getConstantOperandVal(1);
6726 Ptr = Ptr.getOperand(0);
6731 // FIXME: 256-bit vector instructions don't require strict alignment;
6732 // improve this code to support it better.
6733 unsigned RequiredAlign = VT.getSizeInBits()/8;
6734 SDValue Chain = LD->getChain();
6735 // Make sure the stack object alignment is at least 16 or 32.
6736 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6737 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6738 if (MFI.isFixedObjectIndex(FI)) {
6739 // Can't change the alignment. FIXME: It's possible to compute the exact
6740 // stack offset and reference FI + adjusted offset instead, if someone
6741 // *really* cares about this; that's the way to implement it.
6744 MFI.setObjectAlignment(FI, RequiredAlign);
6748 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6749 // Ptr + (Offset & ~15).
6752 if ((Offset % RequiredAlign) & 3)
6754 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6757 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6758 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6761 int EltNo = (Offset - StartOffset) >> 2;
6762 unsigned NumElems = VT.getVectorNumElements();
6764 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6765 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6766 LD->getPointerInfo().getWithOffset(StartOffset));
6768 SmallVector<int, 8> Mask(NumElems, EltNo);
6770 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6776 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6777 /// elements can be replaced by a single large load which has the same value as
6778 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6780 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6781 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6782 const SDLoc &DL, SelectionDAG &DAG,
6783 const X86Subtarget &Subtarget,
6784 bool isAfterLegalize) {
6785 unsigned NumElems = Elts.size();
6787 int LastLoadedElt = -1;
6788 SmallBitVector LoadMask(NumElems, false);
6789 SmallBitVector ZeroMask(NumElems, false);
6790 SmallBitVector UndefMask(NumElems, false);
6792 // For each element in the initializer, see if we've found a load, zero or
6793 // an undef.
6794 for (unsigned i = 0; i < NumElems; ++i) {
6795 SDValue Elt = peekThroughBitcasts(Elts[i]);
6800 UndefMask[i] = true;
6801 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6803 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6806 // Each loaded element must be the correct fractional portion of the
6807 // requested vector load.
6808 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6813 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6814 "Incomplete element masks");
6816 // Handle Special Cases - all undef or undef/zero.
6817 if (UndefMask.count() == NumElems)
6818 return DAG.getUNDEF(VT);
6820 // FIXME: Should we return this as a BUILD_VECTOR instead?
6821 if ((ZeroMask | UndefMask).count() == NumElems)
6822 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6823 : DAG.getConstantFP(0.0, DL, VT);
6825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6826 int FirstLoadedElt = LoadMask.find_first();
6827 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6828 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6829 EVT LDBaseVT = EltBase.getValueType();
6831 // Consecutive loads can contain UNDEF but not ZERO elements.
6832 // Consecutive loads with UNDEF and ZERO elements require an
6833 // additional shuffle stage to clear the ZERO elements.
6834 bool IsConsecutiveLoad = true;
6835 bool IsConsecutiveLoadWithZeros = true;
6836 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6838 SDValue Elt = peekThroughBitcasts(Elts[i]);
6839 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6840 if (!DAG.areNonVolatileConsecutiveLoads(
6841 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6842 i - FirstLoadedElt)) {
6843 IsConsecutiveLoad = false;
6844 IsConsecutiveLoadWithZeros = false;
6847 } else if (ZeroMask[i]) {
6848 IsConsecutiveLoad = false;
6852 SmallVector<LoadSDNode *, 8> Loads;
6853 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6855 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6857 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6858 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6859 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6860 "Cannot merge volatile loads.");
6862 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6863 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6864 for (auto *LD : Loads)
6865 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6869 // LOAD - all consecutive load/undefs (must start/end with a load).
6870 // If we have found an entire vector of loads and undefs, then return a large
6871 // load of the entire vector width starting at the base pointer.
6872 // If the vector contains zeros, then attempt to shuffle those elements.
6873 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6874 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6875 assert(LDBase && "Did not find base load for merging consecutive loads");
6876 EVT EltVT = LDBase->getValueType(0);
6877 // Ensure that the input vector size for the merged loads matches the
6878 // cumulative size of the input elements.
6879 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6882 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6885 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6886 // will lower to regular temporal loads and use the cache.
6887 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6888 VT.is256BitVector() && !Subtarget.hasInt256())
6891 if (IsConsecutiveLoad)
6892 return CreateLoad(VT, LDBase);
6894 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6895 // vector and a zero vector to clear out the zero elements.
6896 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6897 SmallVector<int, 4> ClearMask(NumElems, -1);
6898 for (unsigned i = 0; i < NumElems; ++i) {
6900 ClearMask[i] = i + NumElems;
6901 else if (LoadMask[i])
6904 SDValue V = CreateLoad(VT, LDBase);
6905 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6906 : DAG.getConstantFP(0.0, DL, VT);
6907 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6911 int LoadSize =
6912 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6914 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6915 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6916 (LoadSize == 32 || LoadSize == 64) &&
6917 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6918 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6919 : MVT::getIntegerVT(LoadSize);
6920 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6921 if (TLI.isTypeLegal(VecVT)) {
6922 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6923 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6925 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6926 LDBase->getPointerInfo(),
6927 LDBase->getAlignment(),
6928 MachineMemOperand::MOLoad);
6929 for (auto *LD : Loads)
6930 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6931 return DAG.getBitcast(VT, ResNode);
6938 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6939 unsigned SplatBitSize, LLVMContext &C) {
6940 unsigned ScalarSize = VT.getScalarSizeInBits();
6941 unsigned NumElm = SplatBitSize / ScalarSize;
6943 SmallVector<Constant *, 32> ConstantVec;
6944 for (unsigned i = 0; i < NumElm; i++) {
6945 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6947 if (VT.isFloatingPoint()) {
6948 if (ScalarSize == 32) {
6949 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6951 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6952 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6955 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6956 ConstantVec.push_back(Const);
6958 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6961 static bool isUseOfShuffle(SDNode *N) {
6962 for (auto *U : N->uses()) {
6963 if (isTargetShuffle(U->getOpcode()))
6965 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6966 return isUseOfShuffle(U);
6971 // Check if the current node of the build vector is a zero-extended vector.
6972 // If so, return the value extended.
6973 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6974 // NumElt - return the number of zero extended identical values.
6975 // EltType - return the type of the value, including the zero extend.
6976 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6977 unsigned &NumElt, MVT &EltType) {
6978 SDValue ExtValue = Op->getOperand(0);
6979 unsigned NumElts = Op->getNumOperands();
6980 unsigned Delta = NumElts;
6982 for (unsigned i = 1; i < NumElts; i++) {
6983 if (Op->getOperand(i) == ExtValue) {
6987 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6990 if (!isPowerOf2_32(Delta) || Delta == 1)
6993 for (unsigned i = Delta; i < NumElts; i++) {
6994 if (i % Delta == 0) {
6995 if (Op->getOperand(i) != ExtValue)
6997 } else if (!(isNullConstant(Op->getOperand(i)) ||
6998 Op->getOperand(i).isUndef()))
7001 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
7002 unsigned ExtVTSize = EltSize * Delta;
7003 EltType = MVT::getIntegerVT(ExtVTSize);
7004 NumElt = NumElts / Delta;
7008 /// Attempt to use the vbroadcast instruction to generate a splat value
7009 /// from a splat BUILD_VECTOR which uses:
7010 /// a. A single scalar load, or a constant.
7011 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7013 /// The VBROADCAST node is returned when a pattern is found,
7014 /// or SDValue() otherwise.
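/// For example (illustrative only): (build_vector %ld, %ld, %ld, %ld), where
/// %ld is a single scalar load, becomes (v4f32 (VBROADCAST %ld)).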
7015 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7016 const X86Subtarget &Subtarget,
7017 SelectionDAG &DAG) {
7018 // VBROADCAST requires AVX.
7019 // TODO: Splats could be generated for non-AVX CPUs using SSE
7020 // instructions, but there's less potential gain for only 128-bit vectors.
7021 if (!Subtarget.hasAVX())
7024 MVT VT = BVOp->getSimpleValueType(0);
7027 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7028 "Unsupported vector type for broadcast.");
7030 BitVector UndefElements;
7031 SDValue Ld = BVOp->getSplatValue(&UndefElements);
7033 // Attempt to use VBROADCASTM
7034 // From this pattern:
7035 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7036 // b. t1 = (build_vector t0 t0)
7038 // Create (VBROADCASTM v2i1 X)
7039 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7040 MVT EltType = VT.getScalarType();
7041 unsigned NumElts = VT.getVectorNumElements();
7043 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
7044 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7045 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7046 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7048 BOperand = ZeroExtended.getOperand(0);
7050 BOperand = Ld.getOperand(0).getOperand(0);
7051 MVT MaskVT = BOperand.getSimpleValueType();
7052 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7053 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7055 DAG.getNode(X86ISD::VBROADCASTM, dl,
7056 MVT::getVectorVT(EltType, NumElts), BOperand);
7057 return DAG.getBitcast(VT, Brdcst);
7062 // We need a splat of a single value to use broadcast, and it doesn't
7063 // make any sense if the value is only in one element of the vector.
7064 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
7065 APInt SplatValue, Undef;
7066 unsigned SplatBitSize;
7068 // Check if this is a repeated constant pattern suitable for broadcasting.
7069 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7070 SplatBitSize > VT.getScalarSizeInBits() &&
7071 SplatBitSize < VT.getSizeInBits()) {
7072 // Avoid replacing with broadcast when it's a use of a shuffle
7073 // instruction to preserve the present custom lowering of shuffles.
7074 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7076 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7077 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7078 LLVMContext *Ctx = DAG.getContext();
7079 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7080 if (Subtarget.hasAVX()) {
7081 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7082 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7083 // Splatted value can fit in one INTEGER constant in constant pool.
7084 // Load the constant and broadcast it.
7085 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7086 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7087 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7088 SDValue CP = DAG.getConstantPool(C, PVT);
7089 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7091 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7093 CVT, dl, DAG.getEntryNode(), CP,
7094 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7096 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7097 MVT::getVectorVT(CVT, Repeat), Ld);
7098 return DAG.getBitcast(VT, Brdcst);
7099 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7100 // Splatted value can fit in one FLOAT constant in constant pool.
7101 // Load the constant and broadcast it.
7102 // AVX has support for 32 and 64 bit broadcasts for floats only.
7103 // There is no 64-bit integer broadcast on a 32-bit subtarget.
7104 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7105 // Lower the splat via APFloat directly, to avoid any conversion.
7108 ? ConstantFP::get(*Ctx,
7109 APFloat(APFloat::IEEEsingle(), SplatValue))
7110 : ConstantFP::get(*Ctx,
7111 APFloat(APFloat::IEEEdouble(), SplatValue));
7112 SDValue CP = DAG.getConstantPool(C, PVT);
7113 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7115 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7117 CVT, dl, DAG.getEntryNode(), CP,
7118 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7120 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7121 MVT::getVectorVT(CVT, Repeat), Ld);
7122 return DAG.getBitcast(VT, Brdcst);
7123 } else if (SplatBitSize > 64) {
7124 // Load the vector of constants and broadcast it.
7125 MVT CVT = VT.getScalarType();
7126 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7128 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7129 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7130 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7132 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7133 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7135 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7136 return DAG.getBitcast(VT, Brdcst);
7143 bool ConstSplatVal =
7144 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7146 // Make sure that all of the users of a non-constant load are from the
7147 // BUILD_VECTOR node.
7148 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7151 unsigned ScalarSize = Ld.getValueSizeInBits();
7152 bool IsGE256 = (VT.getSizeInBits() >= 256);
7154 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7155 // instruction to save 8 or more bytes of constant pool data.
7156 // TODO: If multiple splats are generated to load the same constant,
7157 // it may be detrimental to overall size. There needs to be a way to detect
7158 // that condition to know if this is truly a size win.
7159 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7161 // Handle broadcasting a single constant scalar from the constant pool
7162 // into a vector.
7163 // On Sandybridge (no AVX2), it is still better to load a constant vector
7164 // from the constant pool and not to broadcast it from a scalar.
7165 // But override that restriction when optimizing for size.
7166 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7167 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7168 EVT CVT = Ld.getValueType();
7169 assert(!CVT.isVector() && "Must not broadcast a vector type");
7171 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7172 // For size optimization, also splat v2f64 and v2i64, and for size opt
7173 // with AVX2, also splat i8 and i16.
7174 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7175 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7176 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7177 const Constant *C = nullptr;
7178 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7179 C = CI->getConstantIntValue();
7180 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7181 C = CF->getConstantFPValue();
7183 assert(C && "Invalid constant type");
7185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7187 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7188 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7190 CVT, dl, DAG.getEntryNode(), CP,
7191 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7194 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7198 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7200 // Handle AVX2 in-register broadcasts.
7201 if (!IsLoad && Subtarget.hasInt256() &&
7202 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7203 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7205 // The scalar source must be a normal load.
7209 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7210 (Subtarget.hasVLX() && ScalarSize == 64))
7211 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7213 // The integer check is needed for the 64-bit into 128-bit case, so that it
7214 // doesn't match f64, since there is no vbroadcastsd xmm instruction.
7215 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7216 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7217 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7220 // Unsupported broadcast.
7224 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7225 /// underlying vector and index.
7227 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7228 /// index.
7229 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7231 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7232 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7235 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7236 // lowered this:
7237 // (extract_vector_elt (v8f32 %1), Constant<6>)
7238 // to:
7239 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7240 // (extract_subvector (v8f32 %0), Constant<4>),
7241 // undef)
7242 // Constant<2>)
7243 // In this case the vector is the extract_subvector expression and the index
7244 // is 2, as specified by the shuffle.
7245 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7246 SDValue ShuffleVec = SVOp->getOperand(0);
7247 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7248 assert(ShuffleVecVT.getVectorElementType() ==
7249 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7251 int ShuffleIdx = SVOp->getMaskElt(Idx);
7252 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7253 ExtractedFromVec = ShuffleVec;
7259 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7260 MVT VT = Op.getSimpleValueType();
7262 // Skip if insert_vec_elt is not supported.
7263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7264 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7268 unsigned NumElems = Op.getNumOperands();
7272 SmallVector<unsigned, 4> InsertIndices;
7273 SmallVector<int, 8> Mask(NumElems, -1);
7275 for (unsigned i = 0; i != NumElems; ++i) {
7276 unsigned Opc = Op.getOperand(i).getOpcode();
7278 if (Opc == ISD::UNDEF)
7281 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7282 // Quit if more than one element needs inserting.
7283 if (InsertIndices.size() > 1)
7286 InsertIndices.push_back(i);
7290 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7291 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7293 // Quit if non-constant index.
7294 if (!isa<ConstantSDNode>(ExtIdx))
7296 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7298 // Quit if extracted from vector of different type.
7299 if (ExtractedFromVec.getValueType() != VT)
7302 if (!VecIn1.getNode())
7303 VecIn1 = ExtractedFromVec;
7304 else if (VecIn1 != ExtractedFromVec) {
7305 if (!VecIn2.getNode())
7306 VecIn2 = ExtractedFromVec;
7307 else if (VecIn2 != ExtractedFromVec)
7308 // Quit if more than 2 vectors to shuffle
7312 if (ExtractedFromVec == VecIn1)
7314 else if (ExtractedFromVec == VecIn2)
7315 Mask[i] = Idx + NumElems;
7318 if (!VecIn1.getNode())
7321 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7322 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7324 for (unsigned Idx : InsertIndices)
7325 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7326 DAG.getIntPtrConstant(Idx, DL));
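// Convert a vXi1 build_vector of constants to an integer immediate, taking
// bit i of the result from operand i. For example (illustrative only):
// (v8i1 <1,0,1,1,0,0,0,0>) becomes the i8 constant 0b00001101 (13).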
7331 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7332 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7333 Op.getScalarValueSizeInBits() == 1 &&
7334 "Can not convert non-constant vector");
7335 uint64_t Immediate = 0;
7336 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7337 SDValue In = Op.getOperand(idx);
7339 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7342 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7343 return DAG.getConstant(Immediate, dl, VT);
7345 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7346 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7347 const X86Subtarget &Subtarget) {
7349 MVT VT = Op.getSimpleValueType();
7350 assert((VT.getVectorElementType() == MVT::i1) &&
7351 "Unexpected type in LowerBUILD_VECTORvXi1!");
7354 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7357 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7360 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7361 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7362 // Split the pieces.
7364 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7366 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7367 // We have to manually lower both halves so getNode doesn't try to
7368 // reassemble the build_vector.
7369 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7370 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7371 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7373 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7374 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7375 return DAG.getBitcast(VT, Imm);
7376 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7378 DAG.getIntPtrConstant(0, dl));
7381 // Vector has one or more non-const elements
7382 uint64_t Immediate = 0;
7383 SmallVector<unsigned, 16> NonConstIdx;
7384 bool IsSplat = true;
7385 bool HasConstElts = false;
7387 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7388 SDValue In = Op.getOperand(idx);
7391 if (!isa<ConstantSDNode>(In))
7392 NonConstIdx.push_back(idx);
7394 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7395 HasConstElts = true;
7399 else if (In != Op.getOperand(SplatIdx))
7403 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7405 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7406 DAG.getConstant(1, dl, VT),
7407 DAG.getConstant(0, dl, VT));
7409 // Insert elements one by one.
7413 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7414 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7416 else if (HasConstElts)
7417 Imm = DAG.getConstant(0, dl, VT);
7419 Imm = DAG.getUNDEF(VT);
7420 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7421 DstVec = DAG.getBitcast(VT, Imm);
7423 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7424 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7425 DAG.getIntPtrConstant(0, dl));
7428 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7429 unsigned InsertIdx = NonConstIdx[i];
7430 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7431 Op.getOperand(InsertIdx),
7432 DAG.getIntPtrConstant(InsertIdx, dl));
7437 /// Return true if \p N implements a horizontal binop, placing the
7438 /// operands for the horizontal binop into V0 and V1.
7440 /// This is a helper function of LowerToHorizontalOp().
7441 /// This function checks that the build_vector \p N in input implements a
7442 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7443 /// operation to match.
7444 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7445 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7446 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7447 /// arithmetic sub.
7449 /// This function only analyzes elements of \p N whose indices are
7450 /// in range [BaseIdx, LastIdx).
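/// For example (illustrative only), the v4f32 build_vector
///   ((fadd (extractelt A, 0), (extractelt A, 1)),
///    (fadd (extractelt A, 2), (extractelt A, 3)),
///    (fadd (extractelt B, 0), (extractelt B, 1)),
///    (fadd (extractelt B, 2), (extractelt B, 3)))
/// matches with V0 = A and V1 = B, which can then lower to (FHADD A, B).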
7451 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7453 unsigned BaseIdx, unsigned LastIdx,
7454 SDValue &V0, SDValue &V1) {
7455 EVT VT = N->getValueType(0);
7457 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7458 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7459 "Invalid Vector in input!");
7461 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7462 bool CanFold = true;
7463 unsigned ExpectedVExtractIdx = BaseIdx;
7464 unsigned NumElts = LastIdx - BaseIdx;
7465 V0 = DAG.getUNDEF(VT);
7466 V1 = DAG.getUNDEF(VT);
7468 // Check if N implements a horizontal binop.
7469 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7470 SDValue Op = N->getOperand(i + BaseIdx);
7473 if (Op->isUndef()) {
7474 // Update the expected vector extract index.
7475 if (i * 2 == NumElts)
7476 ExpectedVExtractIdx = BaseIdx;
7477 ExpectedVExtractIdx += 2;
7481 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7486 SDValue Op0 = Op.getOperand(0);
7487 SDValue Op1 = Op.getOperand(1);
7489 // Try to match the following pattern:
7490 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7491 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7492 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7493 Op0.getOperand(0) == Op1.getOperand(0) &&
7494 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7495 isa<ConstantSDNode>(Op1.getOperand(1)));
7499 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7500 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7502 if (i * 2 < NumElts) {
7504 V0 = Op0.getOperand(0);
7505 if (V0.getValueType() != VT)
7510 V1 = Op0.getOperand(0);
7511 if (V1.getValueType() != VT)
7514 if (i * 2 == NumElts)
7515 ExpectedVExtractIdx = BaseIdx;
7518 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7519 if (I0 == ExpectedVExtractIdx)
7520 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7521 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7522 // Try to match the following dag sequence:
7523 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7524 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7528 ExpectedVExtractIdx += 2;
7534 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7535 /// a concat_vector.
7537 /// This is a helper function of LowerToHorizontalOp().
7538 /// This function expects two 256-bit vectors called V0 and V1.
7539 /// At first, each vector is split into two separate 128-bit vectors.
7540 /// Then, the resulting 128-bit vectors are used to implement two
7541 /// horizontal binary operations.
7543 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7545 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
7546 /// to the two new horizontal binops.
7547 /// When Mode is set, the first horizontal binop dag node takes as input
7548 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7549 /// horizontal binop dag node takes as input the lower 128-bit of V1
7550 /// and the upper 128-bit of V1.
7552 /// HADD V0_LO, V0_HI
7553 /// HADD V1_LO, V1_HI
7555 /// Otherwise, the first horizontal binop dag node takes as input the lower
7556 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7557 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7559 /// HADD V0_LO, V1_LO
7560 /// HADD V0_HI, V1_HI
7562 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7563 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7564 /// the upper 128-bits of the result.
7565 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7566 const SDLoc &DL, SelectionDAG &DAG,
7567 unsigned X86Opcode, bool Mode,
7568 bool isUndefLO, bool isUndefHI) {
7569 MVT VT = V0.getSimpleValueType();
7570 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7571 "Invalid nodes in input!");
7573 unsigned NumElts = VT.getVectorNumElements();
7574 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7575 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7576 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7577 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7578 MVT NewVT = V0_LO.getSimpleValueType();
7580 SDValue LO = DAG.getUNDEF(NewVT);
7581 SDValue HI = DAG.getUNDEF(NewVT);
7584 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7585 if (!isUndefLO && !V0->isUndef())
7586 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7587 if (!isUndefHI && !V1->isUndef())
7588 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7590 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7591 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7592 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7594 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7595 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7598 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7601 /// Returns true iff \p BV builds a vector whose result is equivalent to
7602 /// the result of an ADDSUB/SUBADD operation.
7603 /// If true is returned, then the operands of the ADDSUB = Opnd0 +- Opnd1
7604 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7605 /// \p Opnd0 and \p Opnd1.
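/// For example (illustrative only), the v4f32 build_vector
///   ((fsub (extractelt A, 0), (extractelt B, 0)),
///    (fadd (extractelt A, 1), (extractelt B, 1)),
///    (fsub (extractelt A, 2), (extractelt B, 2)),
///    (fadd (extractelt A, 3), (extractelt B, 3)))
/// is recognized as (ADDSUB A, B), i.e. Opnd0 = A and Opnd1 = B.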
7606 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7607 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7608 SDValue &Opnd0, SDValue &Opnd1,
7609 unsigned &NumExtracts,
7612 MVT VT = BV->getSimpleValueType(0);
7613 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7616 unsigned NumElts = VT.getVectorNumElements();
7617 SDValue InVec0 = DAG.getUNDEF(VT);
7618 SDValue InVec1 = DAG.getUNDEF(VT);
7622 // Odd-numbered elements in the input build vector are obtained from
7623 // adding/subtracting two integer/float elements.
7624 // Even-numbered elements in the input build vector are obtained from
7625 // subtracting/adding two integer/float elements.
7626 unsigned Opc[2] {0, 0};
7627 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7628 SDValue Op = BV->getOperand(i);
7630 // Skip 'undef' values.
7631 unsigned Opcode = Op.getOpcode();
7632 if (Opcode == ISD::UNDEF)
7635 // Early exit if we found an unexpected opcode.
7636 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7639 SDValue Op0 = Op.getOperand(0);
7640 SDValue Op1 = Op.getOperand(1);
7642 // Try to match the following pattern:
7643 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7644 // Early exit if we cannot match that sequence.
7645 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7646 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7647 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7648 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7649 Op0.getOperand(1) != Op1.getOperand(1))
7652 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7656 // We found a valid add/sub node; make sure it's the same opcode as previous
7657 // elements for this parity.
7658 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7660 Opc[i % 2] = Opcode;
7662 // Update InVec0 and InVec1.
7663 if (InVec0.isUndef()) {
7664 InVec0 = Op0.getOperand(0);
7665 if (InVec0.getSimpleValueType() != VT)
7668 if (InVec1.isUndef()) {
7669 InVec1 = Op1.getOperand(0);
7670 if (InVec1.getSimpleValueType() != VT)
7674 // Make sure that the operands in input to each add/sub node always
7675 // come from the same pair of vectors.
7676 if (InVec0 != Op0.getOperand(0)) {
7677 if (Opcode == ISD::FSUB)
7680 // FADD is commutable. Try to commute the operands
7681 // and then test again.
7682 std::swap(Op0, Op1);
7683 if (InVec0 != Op0.getOperand(0))
7687 if (InVec1 != Op1.getOperand(0))
7690 // Increment the number of extractions done.
7694 // Ensure we have found an opcode for both parities and that they are
7695 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7696 // inputs are undef.
7697 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7698 InVec0.isUndef() || InVec1.isUndef())
7701 IsSubAdd = Opc[0] == ISD::FADD;
7708 /// Returns true if it is possible to fold MUL and an idiom that has already
7709 /// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7710 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7711 /// operands of FMADDSUB/FMSUBADD are written to \p Opnd0, \p Opnd1, \p Opnd2.
7713 /// Prior to calling this function it should be known that there is some
7714 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7715 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7716 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7717 /// of \p Opnd0 uses is expected to be equal to 2.
7718 /// For example, this function may be called for the following IR:
7719 /// %AB = fmul fast <2 x double> %A, %B
7720 /// %Sub = fsub fast <2 x double> %AB, %C
7721 /// %Add = fadd fast <2 x double> %AB, %C
7722 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7723 /// <2 x i32> <i32 0, i32 3>
7724 /// There is a def for %Addsub here, which potentially can be replaced by
7725 /// X86ISD::ADDSUB operation:
7726 /// %Addsub = X86ISD::ADDSUB %AB, %C
7727 /// and such ADDSUB can further be replaced with FMADDSUB:
7728 /// %Addsub = FMADDSUB %A, %B, %C.
7730 /// The main reason why this method is called before the replacement of the
7731 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7732 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7733 /// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1,
                                 SDValue &Opnd2, unsigned ExpectedUses) {
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
                        IsSubAdd))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // We only support ADDSUB.
  if (IsSubAdd)
    return SDValue();

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB
  // idiom recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
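
// Horizontal binops combine adjacent element pairs within each source vector.
// As an illustration, for v4f32:
//   HADD V0, V1 = <V0[0]+V0[1], V0[2]+V0[3], V1[0]+V1[1], V1[2]+V1[3]>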
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts / 2;

  // Count the number of UNDEF operands in the input build_vector.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget.hasAVX())
    return SDValue();
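
  // The 256-bit AVX horizontal instructions operate on the two 128-bit halves
  // independently, so each half of the build_vector is matched separately and
  // the inputs feeding both halves must agree.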
  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
             isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                               InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binops followed
      // by a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }
  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
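
// A sketch of the transform performed below: a build_vector whose elements
// all apply the same bit operation against constants, e.g.
//   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
// becomes a single vector operation:
//   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))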
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit op to the vectors.
/// NOTE: It's not in our interest to start making a general purpose
/// vectorizer from this, but enough scalar bit operations are created from
/// the later legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  return SDValue();
}
/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
/// from a vector of source values and a vector of extraction indices.
/// The vectors might be manipulated to match the type of the permute op.
static SDValue createVariablePermute(MVT VT, SDValue SrcVec,
                                     SDValue IndicesVec, SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Adjust IndicesVec to match VT size.
  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
         "Illegal variable permute mask size");
  if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
    IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                  NumElts * VT.getScalarSizeInBits());
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

  // Handle SrcVecs that don't match the VT type.
  if (SrcVec.getValueSizeInBits() != SizeInBits) {
    if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
      // Handle larger SrcVec by treating it as a larger permute.
      unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
      VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
      IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
      IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
                                  Subtarget, DAG, SDLoc(IndicesVec));
      return extractSubVector(
          createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
          DAG, DL, SizeInBits);
    } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
      // Widen smaller SrcVec to match VT.
      SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
    } else
      return SDValue();
  }
  auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
    EVT SrcVT = Idx.getValueType();
    unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
    uint64_t IndexScale = 0;
    uint64_t IndexOffset = 0;

    // If we're scaling a smaller permute op, then we need to repeat the
    // indices, scaling and offsetting them as well.
    // e.g. v4i32 -> v16i8 (Scale = 4)
    // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
    // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
    for (uint64_t i = 0; i != Scale; ++i) {
      IndexScale |= Scale << (i * NumDstBits);
      IndexOffset |= i << (i * NumDstBits);
    }

    Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
    Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
                      DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
    return Idx;
  };
  unsigned Opcode = 0;
  MVT ShuffleVT = VT;
  switch (VT.SimpleTy) {
  default:
    break;
  case MVT::v16i8:
    if (Subtarget.hasSSSE3())
      Opcode = X86ISD::PSHUFB;
    break;
  case MVT::v8i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v4f32:
  case MVT::v4i32:
    if (Subtarget.hasAVX()) {
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v4f32;
    } else if (Subtarget.hasSSSE3()) {
      Opcode = X86ISD::PSHUFB;
      ShuffleVT = MVT::v16i8;
    }
    break;
  case MVT::v2f64:
  case MVT::v2i64:
    if (Subtarget.hasAVX()) {
      // VPERMILPD selects using bit#1 of the index vector, so scale
      // IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
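      // Doubling each index moves its value from bit#0 into bit#1 (an index
      // of 1 becomes 2, i.e. 0b10), which is the bit VPERMILPD tests.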
      Opcode = X86ISD::VPERMILPV;
      ShuffleVT = MVT::v2f64;
    } else if (Subtarget.hasSSE41()) {
      // SSE41 can compare v2i64 - select between indices 0 and 1.
      return DAG.getSelectCC(
          DL, IndicesVec,
          getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
          DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
          ISD::CondCode::SETEQ);
    }
    break;
  case MVT::v32i8:
    if (Subtarget.hasVLX() && Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasXOP()) {
      SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
      SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
      return DAG.getNode(
          ISD::CONCAT_VECTORS, DL, VT,
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
    } else if (Subtarget.hasAVX()) {
      SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
      SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
      SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
      SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
      auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                              ArrayRef<SDValue> Ops) {
        // Permute Lo and Hi and then select based on index range.
        // This works as PSHUFB uses bits[3:0] to permute elements and we
        // don't care about bit[7] as it's just an index vector.
        SDValue Idx = Ops[2];
        EVT VT = Idx.getValueType();
        return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
                               DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
                               ISD::CondCode::SETGT);
      };
      SDValue Ops[] = {LoLo, HiHi, IndicesVec};
      return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
                              PSHUFBBuilder);
    }
    break;
  case MVT::v16i16:
    if (Subtarget.hasVLX() && Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      // Scale to v32i8 and perform as v32i8.
      IndicesVec = ScaleIndices(IndicesVec, 2);
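      // Each word index becomes a pair of byte indices, e.g. word index 3
      // expands to the byte indices <6, 7>.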
      return DAG.getBitcast(
          VT, createVariablePermute(
                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
                  DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
    }
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (Subtarget.hasAVX2())
      Opcode = X86ISD::VPERMV;
    else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
      SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {0, 1, 2, 3, 0, 1, 2, 3});
      SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
                                          {4, 5, 6, 7, 4, 5, 6, 7});
      if (Subtarget.hasXOP())
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
                                              LoLo, HiHi, IndicesVec,
                                              DAG.getConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPS only uses index bits[0:1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (Subtarget.hasAVX512()) {
      if (!Subtarget.hasVLX()) {
        MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
        SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
                                SDLoc(SrcVec));
        IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
                                    DAG, SDLoc(IndicesVec));
        SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
                                            DAG, Subtarget);
        return extract256BitVector(Res, 0, DAG, DL);
      }
      Opcode = X86ISD::VPERMV;
    } else if (Subtarget.hasAVX()) {
      SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
      SDValue LoLo =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
      SDValue HiHi =
          DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
      // VPERMIL2PD selects with bit#1 of the index vector, so scale
      // IndicesVec.
      IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
      if (Subtarget.hasXOP())
        return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
                                              LoLo, HiHi, IndicesVec,
                                              DAG.getConstant(0, DL, MVT::i8)));
      // Permute Lo and Hi and then select based on index range.
      // This works as VPERMILPD only uses index bit[1] to permute elements.
      SDValue Res = DAG.getSelectCC(
          DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
          DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
          ISD::CondCode::SETGT);
      return DAG.getBitcast(VT, Res);
    }
    break;
  case MVT::v64i8:
    if (Subtarget.hasVBMI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v32i16:
    if (Subtarget.hasBWI())
      Opcode = X86ISD::VPERMV;
    break;
  case MVT::v16f32:
  case MVT::v16i32:
  case MVT::v8f64:
  case MVT::v8i64:
    if (Subtarget.hasAVX512())
      Opcode = X86ISD::VPERMV;
    break;
  }

  if (!Opcode)
    return SDValue();
  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
         "Illegal variable permute shuffle type");

  uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
  if (Scale > 1)
    IndicesVec = ScaleIndices(IndicesVec, Scale);

  EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
  IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);

  SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
  SDValue Res = Opcode == X86ISD::VPERMV
                    ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
                    : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
  return DAG.getBitcast(VT, Res);
}
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  SDValue SrcVec, IndicesVec;
  // Check for a match of the permute source vector and permute index elements.
  // This is done by checking that the i-th build_vector operand is of the
  // form: (extract_elt SrcVec, (extract_elt IndicesVec, i)).
  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
    SDValue Op = V.getOperand(Idx);
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract encountered in V, set the source vector,
    // otherwise verify the extract is from the previously defined source
    // vector.
    if (!SrcVec)
      SrcVec = Op.getOperand(0);
    else if (SrcVec != Op.getOperand(0))
      return SDValue();

    SDValue ExtractedIndex = Op->getOperand(1);
    // Peek through extends.
    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
      ExtractedIndex = ExtractedIndex.getOperand(0);
    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract from the index vector candidate, set the
    // indices vector, otherwise verify the extract is from the previously
    // defined indices vector.
    if (!IndicesVec)
      IndicesVec = ExtractedIndex.getOperand(0);
    else if (IndicesVec != ExtractedIndex.getOperand(0))
      return SDValue();

    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
    if (!PermIdx || PermIdx->getZExtValue() != Idx)
      return SDValue();
  }

  SDLoc DL(V);
  MVT VT = V.getSimpleValueType();
  return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;
  unsigned EVTBits = EltVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
      IsAllConstants = false;
      NumConstants--;
    }
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
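      // e.g. for <0, x, 0, y> (with x, y non-zero), NonZeros ends up as
      // 0b1010 and NumNonZero as 2.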
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as
  // a vector and then insert the variable scalar element. If insertion is not
  // supported, we assume that we will fall back to a shuffle to get the scalar
  // blended with the constants. Insertion into a zero vector is handled as a
  // special-case somewhere below here.
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
    LLVMContext &Context = *DAG.getContext();
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      else if (!Elt.isUndef()) {
        assert(!VarElt.getNode() && !InsIndex.getNode() &&
               "Expected one variable element in this vector");
        VarElt = Elt;
        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
      }
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (eg, floating point). We
    // must lower the vector right here because we can not guarantee that we'll
    // legalize it before loading it. This is also why we could not just create
    // a new build vector here. If the build vector contains illegal constants,
    // it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
  }
  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
          (EltVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (EltVT == MVT::i16 || EltVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
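      // e.g. <0, x> can be built by shifting (scalar_to_vector x) == <x, 0>
      // left by half the vector width (a whole-register byte shift).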
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits / 2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
                                         DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
    return V;

  // See if we can use a vector load to get all of the elements.
  {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }

  // If this is a splat of pairs of 32-bit elements, we can use a narrower
  // build_vector and broadcast it.
  // TODO: We could probably generalize this more.
  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
      // Make sure all the even/odd operands match.
      for (unsigned i = 2; i != NumElems; ++i)
        if (Ops[i % 2] != Op.getOperand(i))
          return false;
      return true;
    };
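    // e.g. <a, b, a, b, a, b, a, b> passes the check and is rebuilt below as
    // a broadcast of the 64-bit pair <a, b>.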
    if (CanSplat(Op, NumElems, Ops)) {
      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
      // Create a new build vector and cast to v2i64/v2f64.
      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
                                     DAG.getBuildVector(NarrowVT, dl, Ops));
      // Broadcast from v2i64/v2f64 and cast to final VT.
      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
                                            NewBV));
    }
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() > 128) {
    MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
                            VT.getSizeInBits() / 2);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS.
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros >> (i*2)) & 0x3) {
      default: llvm_unreachable("Unexpected NonZero count");
      case 0:
        Ops[i] = Ops[i*2];  // Must be a zero vector.
        break;
      case 1:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
        break;
      case 2:
        Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
        break;
      case 3:
        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
        break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }
  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

  // Check for a build vector from mostly shuffle plus few inserting.
  if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
    return Sh;

  // For SSE 4.1, use insertps to put the high elements into the low element.
  if (Subtarget.hasSSE41()) {
    SDValue Result;
    if (!Op.getOperand(0).isUndef())
      Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
    else
      Result = DAG.getUNDEF(VT);

    for (unsigned i = 1; i < NumElems; ++i) {
      if (Op.getOperand(i).isUndef()) continue;
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }

  // Otherwise, expand into a number of unpckl*, start by extending each of
  // our (non-undef) elements to the full vector width with the element in the
  // bottom slot of the vector (which generates no code for SSE).
  SmallVector<SDValue, 8> Ops(NumElems);
  for (unsigned i = 0; i < NumElems; ++i) {
    if (!Op.getOperand(i).isUndef())
      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      Ops[i] = DAG.getUNDEF(VT);
  }

  // Next, we iteratively mix elements, e.g. for v4f32:
  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
  //   Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    // Generate scaled UNPCKL shuffle mask.
    SmallVector<int, 16> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i);
    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
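    // e.g. with NumElems == 8 and Scale == 2 the mask is
    // <0, 1, 8, 9, undef, undef, undef, undef>.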
    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
  }
  return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// TODO: Detect subvector broadcast here instead of DAG combine?
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  unsigned NumOperands = Op.getNumOperands();
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= 1 << i;
      ++NumNonZero;
    }
  }

  // If we have more than 2 non-zero subvectors, build each half separately.
  if (NumNonZero > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements() / 2);
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands / 2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands / 2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  // Otherwise, build it up through insert_subvectors.
  SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                        : DAG.getUNDEF(ResVT);

  MVT SubVT = Op.getOperand(0).getSimpleValueType();
  unsigned NumSubElems = SubVT.getVectorNumElements();
  for (unsigned i = 0; i != NumOperands; ++i) {
    if ((NonZeros & (1 << i)) == 0)
      continue;

    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
                      Op.getOperand(i),
                      DAG.getIntPtrConstant(i * NumSubElems, dl));
  }

  return Vec;
}
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
static bool isExpandWithZeros(const SDValue &Op) {
  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

  for (unsigned i = 1; i < Op.getNumOperands(); i++)
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
      return false;

  return true;
}

// Returns the recognized node if the given node is a type promotion (by
// concatenating i1 zeros) of the result of a node that already zeros all
// upper bits of a k-register.
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
  unsigned Opc = Op.getOpcode();

  assert(Opc == ISD::CONCAT_VECTORS &&
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected node to check for type promotion!");

  // As long as we are concatenating zeros to the upper part of a previous node
  // result, climb up the tree until a node with a different opcode is
  // encountered.
  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
    if (Opc == ISD::INSERT_SUBVECTOR) {
      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
          Op.getConstantOperandVal(2) == 0)
        Op = Op.getOperand(1);
      else
        return SDValue();
    } else { // Opc == ISD::CONCAT_VECTORS
      if (isExpandWithZeros(Op))
        Op = Op.getOperand(0);
      else
        return SDValue();
    }
    Opc = Op.getOpcode();
  }

  // Check if the first inserted node zeroes the upper bits, or an 'and' result
  // of a node that zeros the upper bits (its masked version).
  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
      (Op.getOpcode() == ISD::AND &&
       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode()))))
    return Op;

  return SDValue();
}
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with an instruction that zeroes all upper (irrelevant) bits of
  // the output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
    return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= (uint64_t)1 << i;
      ++NumNonZero;
    }
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NumNonZero <= 1) {
    SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
                          : DAG.getUNDEF(ResVT);
    if (!NumNonZero)
      return Vec;
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
  }

  if (NumOperands > 2) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements() / 2);
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands / 2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands / 2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(NumNonZero == 2 && "Simple cases not handled?");

  if (ResVT.getVectorNumElements() >= 16)
    return Op; // The operation is legal with KUNPCK

  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                            DAG.getUNDEF(ResVT), Op.getOperand(0),
                            DAG.getIntPtrConstant(0, dl));
  unsigned NumElems = ResVT.getVectorNumElements();
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems / 2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                  Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//
/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it
/// is non-trivial to compute in the face of undef lanes. The representation
/// is suitable for use with existing 128-bit shuffles as entries from the
/// second vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
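
// For example, for v8i32 the mask <0, 0, 2, 2, 4, 4, 6, 6> performs the same
// in-lane shuffle in both 128-bit lanes, with RepeatedMask == <0, 0, 2, 2>.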
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> RepeatedMask;
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
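    // e.g. Mask <0, 5, 2, 7> with element 1 zeroable produces the target
    // mask <0, SM_SentinelZero, 2, 7>.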
  }
  return TargetMask;
}

// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
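  // e.g. the reversal mask <3, 2, 1, 0> encodes as 0b00011011 (0x1B), the
  // immediate a PSHUFD needs to reverse a 4 x i32 vector.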
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements than the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}
// The shuffle result is a sequence of the form:
//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[i] elements appear in
//   ascending order and 0* denotes a (possibly empty) run of zero elements.
// Each element of Zeroable corresponds to a particular element of Mask, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose non-zero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
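//
// For example, for v4i32 with elements 1 and 3 zeroable, the mask
// <0, zero, 1, zero> expands the low elements of the source into the
// non-zero slots, which is what VEXPAND performs.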
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's non-zero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zero elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non-zero element.
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non-zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
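  // Each mask element is expanded to NumEltBytes byte indices; e.g. for a
  // v4i32 shuffle, element index 1 becomes the byte indices <4, 5, 6, 7>.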
  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                        unsigned &UnpackOpcode, bool IsUnary,
                                        ArrayRef<int> TargetMask,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
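// Illustrative example (not from the original source): for v4i32 the binary
// unpack masks are <0, 4, 1, 5> (UNPCKL, interleaving the low halves of V1
// and V2) and <2, 6, 3, 7> (UNPCKH, interleaving the high halves).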
static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                      int Delta) {
  int Size = (int)Mask.size();
  int Split = Size / Delta;
  int TruncatedVectorStart = SwappedOps ? Size : 0;

  // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
  if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
    return false;

  // The rest of the mask should not refer to the truncated vector's elements.
  if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
                   TruncatedVectorStart + Size))
    return false;

  return true;
}
// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
//
// An example is the following:
//
// t0: ch = EntryToken
//           t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
//         t25: v4i32 = truncate t2
//       t41: v8i16 = bitcast t25
//       t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
//       Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
//     t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
//   t18: v2i64 = bitcast t51
//
// Without avx512vl, this is lowered to:
//
// vpmovqd %zmm0, %ymm0
// vpshufb {{.*#+}} xmm0 =
// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                           MVT VT, SDValue V1, SDValue V2,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  if (VT != MVT::v16i8 && VT != MVT::v8i16)
    return SDValue();

  if (Mask.size() != VT.getVectorNumElements())
    return SDValue();

  bool SwappedOps = false;

  if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
      return SDValue();

    std::swap(V1, V2);
    SwappedOps = true;
  }

  // Look for:
  //
  // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
  // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
  //
  // and similar ones.
  if (V1.getOpcode() != ISD::BITCAST)
    return SDValue();
  if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
    return SDValue();

  SDValue Src = V1.getOperand(0).getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // The vptrunc** instructions truncating 128-bit and 256-bit vectors
  // are only available with avx512vl.
  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
    return SDValue();

  // Down-converting word to byte is only available with avx512bw. The case
  // with a 256-bit output doesn't contain a shuffle and is therefore not
  // handled here.
  if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
      !Subtarget.hasBWI())
    return SDValue();

  // The first half/quarter of the mask should refer to every second/fourth
  // element of the vector truncated and bitcasted.
  if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
      !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
                                       SDValue &V2, unsigned &PackOpcode,
                                       ArrayRef<int> TargetMask,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

  auto MatchPACK = [&](SDValue N1, SDValue N2) {
    SDValue VV1 = DAG.getBitcast(PackVT, N1);
    SDValue VV2 = DAG.getBitcast(PackVT, N2);
    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
        V1 = VV1;
        V2 = VV2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
      V1 = VV1;
      V2 = VV2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }

    return false;
  };

  // Try binary shuffle.
  SmallVector<int, 32> BinaryMask;
  createPackShuffleMask(VT, BinaryMask, false);
  if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
    if (MatchPACK(V1, V2))
      return true;

  // Try unary shuffle.
  SmallVector<int, 32> UnaryMask;
  createPackShuffleMask(VT, UnaryMask, true);
  if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
    if (MatchPACK(V1, V1))
      return true;

  return false;
}
static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
                                          ArrayRef<int> Mask, SDValue V1,
                                          SDValue V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                                 Subtarget))
    return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                       DAG.getBitcast(PackVT, V2));

  return SDValue();
}
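// Illustrative note (not from the original source): PACKUSWB on two v8i16
// inputs produces a v16i8 <A0..A7, B0..B7> with unsigned saturation, so the
// match above is only legal when the checks prove saturation cannot fire:
// high bits known zero (PACKUS) or enough sign bits (PACKSS).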
/// Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
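// Illustrative note (not from the original source): the sequence above
// computes Result = (V1 & V1Mask) | (V2 & ~V1Mask). For v4i32 with
// Mask = <0, 5, 2, 7>, V1Mask = <-1, 0, -1, 0>: lanes 0 and 2 pass through
// from V1, lanes 1 and 3 from V2.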
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);

  return ScaledMask;
}
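// Illustrative example (not from the original source): scaling
// BlendMask = 0b0110 with Size = 4 and Scale = 2 replicates each selected bit
// across two adjacent lanes, producing 0b00111100.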
/// Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
/// Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);

    // The identity rotation isn't interesting, stop.
    if (StartIdx == 0)
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;
  return Rotation;
}
/// Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}

static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation.
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
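// Illustrative example (not from the original source): for v8i16 with
// Mask = <11, 12, 13, 14, 15, 0, 1, 2>, the element rotation is 3 and the
// byte rotation is 3 * 2 = 6, so on SSSE3 this becomes a single PALIGNR with
// immediate 6; on plain SSE2 it is PSLLDQ $10 + PSRLDQ $6 + POR.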
/// Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // No match.
  return -1;
}
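// Illustrative example (not from the original source): for v4i32 with
// Mask = <zz, 0, zz, 2> (zz = zeroable), Scale = 2 / Shift = 1 matches with
// Left = true, yielding Opcode = VSHLI, ShiftVT = v2i64 and a 32-bit shift
// amount, i.e. each 64-bit element is shifted left by one 32-bit lane.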
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
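// Illustrative example (not from the original source): for v8i16 with
// Mask = <2, 3, zz, zz, -1, -1, -1, -1>, Len = 2 and Idx = 2, giving
// BitLen = 32 and BitIdx = 32: extract 32 bits starting at bit offset 32.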
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
                                        uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
/// Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling, the offset must either be in the bottom lane or at the start of
/// a higher lane. All extended elements must be from the same input vector.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
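// Illustrative note (not from the original source): the unpack fallback above
// zero-extends the low eight i8 elements of a v16i8 to v8i16 with a single
// PUNPCKLBW against a zero vector (interleaving source bytes with zero
// bytes); each further doubling of the element width repeats the unpack.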
/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements need to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue(); // Strange offset with first non-zero mismatch.

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}

/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // elements.
      if (!Zeroable[V2Index])
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
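// Illustrative example (not from the original source): for v4f32 with
// Mask = <4, 1, 2, 3> (lane 0 from V2, the rest from V1), the non-zeroable
// path above reduces to a single MOVSS of V2's low element into V1.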
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
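// Illustrative example (not from the original source): broadcasting i16
// element 3 of a v4i32 source selects the high half of 32-bit element 1, so
// the scalar is shifted right by 16 bits before being truncated and broadcast.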
10873 /// Try to lower broadcast of a single element.
10875 /// For convenience, this code also bundles all of the subtarget feature set
10876 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10877 /// a convenient way to factor it out.
10878 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10879 SDValue V1, SDValue V2,
10880 ArrayRef<int> Mask,
10881 const X86Subtarget &Subtarget,
10882 SelectionDAG &DAG) {
10883 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10884 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10885 (Subtarget.hasAVX2() && VT.isInteger())))
10888 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10889 // we can only broadcast from a register with AVX2.
10890 unsigned NumElts = Mask.size();
10891 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10893 : X86ISD::VBROADCAST;
10894 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10896 // Check that the mask is a broadcast.
10897 int BroadcastIdx = -1;
10898 for (int i = 0; i != (int)NumElts; ++i) {
10899 SmallVector<int, 8> BroadcastMask(NumElts, i);
10900 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10906 if (BroadcastIdx < 0)
10908 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10909 "a sorted mask where the broadcast "
10912 // Go up the chain of (vector) values to find a scalar load that we can
10913 // combine with the broadcast.
10916 switch (V.getOpcode()) {
10917 case ISD::BITCAST: {
10918 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10919 SDValue VSrc = V.getOperand(0);
10920 unsigned NumEltBits = V.getScalarValueSizeInBits();
10921 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10922 if ((NumEltBits % NumSrcBits) == 0)
10923 BroadcastIdx *= (NumEltBits / NumSrcBits);
10924 else if ((NumSrcBits % NumEltBits) == 0 &&
10925 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10926 BroadcastIdx /= (NumSrcBits / NumEltBits);
10932 case ISD::CONCAT_VECTORS: {
10933 int OperandSize = Mask.size() / V.getNumOperands();
10934 V = V.getOperand(BroadcastIdx / OperandSize);
10935 BroadcastIdx %= OperandSize;
10938 case ISD::INSERT_SUBVECTOR: {
10939 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10940 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10944 int BeginIdx = (int)ConstantIdx->getZExtValue();
10946 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10947 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10948 BroadcastIdx -= BeginIdx;
10959 // Ensure the source vector and BroadcastIdx are for a suitable type.
10960 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10961 unsigned NumEltBits = VT.getScalarSizeInBits();
10962 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10963 if ((NumSrcBits % NumEltBits) == 0)
10964 BroadcastIdx *= (NumSrcBits / NumEltBits);
10965 else if ((NumEltBits % NumSrcBits) == 0 &&
10966 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10967 BroadcastIdx /= (NumEltBits / NumSrcBits);
10971 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10972 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10973 V = DAG.getBitcast(SrcVT, V);
  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
                   ? X86ISD::MOVDDUP
                   : Opcode;
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
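    // Worked example (added): broadcasting element 2 of a v4f32 load from
    // Base turns into a scalar f32 load from Base+8 (Offset = 2 * 4 bytes)
    // that feeds the broadcast directly.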
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    V = extract128BitVector(V, BroadcastIdx, DAG, DL);
  }

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128) {
    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
                                 128 / SrcVT.getScalarSizeInBits());
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
    V = DAG.getBitcast(ExtVT, V);
  }

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
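    // Worked example (added): for Mask = {0, 5, 2, 3} with nothing zeroable,
    // V2 element 1 is inserted into position 1, so
    // InsertPSMask = (1 << 6) | (1 << 4) = 0x50.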
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");
  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };
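  // Worked example (added): for a v4i32 mask {3, 7, 1, 5}, both V1Mask and
  // V2Mask become {3, 1, -1, -1}; after permuting each input, UNPCKL
  // interleaves them into <v1[3], v2[3], v1[1], v2[1]>, exactly the
  // requested shuffle.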
  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                        Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          X86ISD::MOVSD, DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;
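  // Note (added example): the SHUFPD immediate packs one selector bit per
  // lane, so Mask = {1, 3} (the high element of each input) encodes as
  // SHUFPDMask = 1 | (1 << 1) = 3.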
  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
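    // For instance (added note): the v2i64 mask {1, 0} widens to the v4i32
    // mask {2, 3, 0, 1}, i.e. PSHUFD with immediate 0x4E.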
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;
  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }
  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
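  // E.g. (added): {0, 1, 4, 5} draws its low half only from V1 and its high
  // half only from V2, so it is a single SHUFPS; {0, 4, 2, 6} mixes both
  // inputs within each half and is not.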
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
/// Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using
/// SHUFPS. It makes no assumptions about whether this is the *best* lowering,
/// it simply uses it.
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle
      // to arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the
      // final shuffle to place them.
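      // Worked example (added): for Mask = {0, 4, 2, 6}, BlendMask becomes
      // {0, 2, 0, 2}, producing <V1[0], V1[2], V2[0], V2[2]>, and the final
      // NewMask {0, 2, 1, 3} arranges it into <V1[0], V2[0], V1[2], V2[2]>.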
      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
    // in SSE1 because otherwise they are widened to v2f64 and never get here.
    if (!Subtarget.hasSSE2()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
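    // E.g. (added): the repeated-input mask {3, 1, 0, 2} encodes as immediate
    // 3 | (1 << 2) | (0 << 4) | (2 << 6) = 0x87.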
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }
  // There are special ways we can lower some single-element blends. However,
  // we have custom ways we can lower more complex single-element blends below
  // that we defer to if both this and BLENDPS fail to match, so restrict this
  // to when the V2 input is targeting element 0 of the mask -- that is the
  // fast case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }
  // Use low/high mov instructions. These are only valid in SSE1 because
  // otherwise they are widened to v2f64 and never get here.
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }
  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }
  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  // Attempt to directly match PSHUFLW or PSHUFHW.
  if (isUndefOrInRange(LoMask, 0, 4) &&
      isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
    return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                       getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
  }
  if (isUndefOrInRange(HiMask, 4, 8) &&
      isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
    for (int i = 0; i != 4; ++i)
      HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
  }
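  // E.g. (added): Mask = {3, 2, 1, 0, 4, 5, 6, 7} matches PSHUFLW directly
  // with immediate 0x1B, leaving the high four words untouched.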
  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  array_pod_sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  array_pod_sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  // If we are shuffling values from one half - check how many different DWORD
  // pairs we need to create. If only 1 or 2 then we can perform this as a
  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
  auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
                               ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
    int PSHUFDMask[4] = { -1, -1, -1, -1 };
    SmallVector<std::pair<int, int>, 4> DWordPairs;
    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

    // Collect the different DWORD pairs.
    for (int DWord = 0; DWord != 4; ++DWord) {
      int M0 = Mask[2 * DWord + 0];
      int M1 = Mask[2 * DWord + 1];
      M0 = (M0 >= 0 ? M0 % 4 : M0);
      M1 = (M1 >= 0 ? M1 % 4 : M1);
      if (M0 < 0 && M1 < 0)
        continue;

      bool Match = false;
      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
        auto &DWordPair = DWordPairs[j];
        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
          PSHUFDMask[DWord] = DOffset + j;
          Match = true;
          break;
        }
      }
      if (!Match) {
        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
        DWordPairs.push_back(std::make_pair(M0, M1));
      }
    }

    if (DWordPairs.size() <= 2) {
      DWordPairs.resize(2, std::make_pair(-1, -1));
      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                              DWordPairs[1].first, DWordPairs[1].second};
      if ((NumHToL + NumHToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
      if ((NumLToL + NumLToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
    }
  }
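  // Worked example (added): the low-half-only mask {0, 3, 0, 3, 1, 2, 1, 2}
  // collects just two dword pairs, (0, 3) and (1, 2), so it lowers to
  // PSHUFLW {0, 3, 1, 2} followed by PSHUFD {0, 0, 1, 1} rather than the
  // longer generic chain below.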
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            ++FixFreeIdx;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }
    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }
    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is
          // no free slot adjacent to one of the inputs. In this case, we have
          // to swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }
    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }
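  // E.g. (added): for a v8i16 shuffle (Scale == 2), a mask element of 9
  // selects V2 word 1, so the corresponding V2Mask bytes are 2 and 3 while
  // the matching V1Mask bytes are 0x80, the PSHUFB zeroing index.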
  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}
/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12393 // Whenever we can lower this as a zext, that instruction is strictly faster
12394 // than any alternative.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
        DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
  return ZExt;

int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

if (NumV2Inputs == 0) {
  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
                                             DAG, Subtarget))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  // Make a copy of the mask so it can be modified.
  SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
  return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                   MutableMask, Subtarget,
                                                   DAG);
}

assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
       "All single-input shuffles should be canonicalized to be V1-input "
       "shuffles.");

// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
  return Shift;

// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
  if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG))
    return V;

// There are special ways we can lower some single-element blends.
if (NumV2Inputs == 1)
  if (SDValue V = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return V;

// We have different paths for blend lowering, but they all must use the
// *exact* same predicate.
bool IsBlendSupported = Subtarget.hasSSE41();
if (IsBlendSupported)
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                 Zeroable, DAG))
  return Masked;

// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
        lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
  return V;

// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                           Subtarget))
  return V;

// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
        DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
  return Rotate;

if (SDValue BitBlend =
        lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
  return BitBlend;

// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                          V2, Mask, DAG))
  return Unpack;

// If we can't directly blend but can use PSHUFB, that will be better as it
// can both shuffle and set up the inefficient blend.
if (!IsBlendSupported && Subtarget.hasSSSE3()) {
  bool V1InUse, V2InUse;
  return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                            Zeroable, DAG, V1InUse, V2InUse);
}

// We can always bit-blend if we have to so the fallback strategy is to
// decompose into single-input permutes and blends.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                  Mask, DAG);
}
12505 /// Check whether a compaction lowering can be done by dropping even
12506 /// elements and compute how many times even elements must be dropped.
12508 /// This handles shuffles which take every Nth element where N is a power of
12509 /// two. Example shuffle masks:
12511 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12512 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12513 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12514 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12515 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12516 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12518 /// Any of these lanes can of course be undef.
12520 /// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for improvement.
12524 /// \returns N above, or the number of times even elements must be dropped if
12525 /// there is such a number. Otherwise returns zero.
12526 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12527 bool IsSingleInput) {
12528 // The modulus for the shuffle vector entries is based on whether this is
12529 // a single input or not.
12530 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12531 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12532 "We should only be called with masks with a power-of-2 size!");
12534 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12536 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12537 // and 2^3 simultaneously. This is because we may have ambiguity with
12538 // partially undef inputs.
12539 bool ViableForN[3] = {true, true, true};
for (int i = 0, e = Mask.size(); i < e; ++i) {
  // Ignore undef lanes, we'll optimistically collapse them to the pattern we
  // want.
  if (Mask[i] < 0)
    continue;

  bool IsAnyViable = false;
  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j]) {
      uint64_t N = j + 1;

      // The shuffle mask must be equal to (i * 2^N) % M.
      if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
        IsAnyViable = true;
      else
        ViableForN[j] = false;
    }
  // Early exit if we exhaust the possible powers of two.
  if (!IsAnyViable)
    break;
}

for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
  if (ViableForN[j])
    return j + 1;

// Return 0 as there is no viable power of two.
return 0;
}
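// Worked example (illustrative): for the single-input v16i8 mask
// <0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14>, ShuffleModulus is
// 16 and every entry equals ((i << 1) & 15), so ViableForN[0] survives the
// scan and the function returns N = 1: one round of element dropping (e.g. a
// single PACKUSWB on a byte-cleared input) realizes the shuffle.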
12571 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12572 ArrayRef<int> Mask, SDValue V1,
12573 SDValue V2, SelectionDAG &DAG) {
12574 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12575 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
if (V2.isUndef())
  return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);

return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
}
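// Usage note (an illustration under stated assumptions, not asserted by this
// file): on an AVX512VBMI+VLX target, a single-input v16i8 shuffle maps to
// one VPERMB via the X86ISD::VPERMV node built here, while the two-input
// form maps to the VPERMT2B/VPERMI2B family via X86ISD::VPERMV3, with the
// constant mask vector indexing the concatenation of V1 and V2.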
12584 /// Generic lowering of v16i8 shuffles.
12586 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12587 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12588 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
12591 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12592 const APInt &Zeroable,
12593 SDValue V1, SDValue V2,
12594 const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12597 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12598 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
  return Shift;

// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
        DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
  return Rotate;

// Use dedicated pack instructions for masks that match their pattern.
if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
                                           Subtarget))
  return V;

// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
  return ZExt;

// See if we can use SSE4A Extraction / Insertion.
if (Subtarget.hasSSE4A())
  if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                              Zeroable, DAG))
    return V;
12626 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12628 // For single-input shuffles, there are some nicer lowering tricks we can use.
12629 if (NumV2Elements == 0) {
12630 // Check for being able to broadcast a single element.
12631 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;
12635 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12636 // Notably, this handles splat and partial-splat shuffles more efficiently.
12637 // However, it only makes sense if the pre-duplication shuffle simplifies
12638 // things significantly. Currently, this means we need to be able to
12639 // express the pre-duplication shuffle as an i16 shuffle.
12641 // FIXME: We should check for other patterns which can be widened into an
12642 // i16 shuffle as well.
auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
  for (int i = 0; i < 16; i += 2)
    if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
      return false;
  return true;
};
auto tryToWidenViaDuplication = [&]() -> SDValue {
  if (!canWidenViaDuplication(Mask))
    return SDValue();
  SmallVector<int, 4> LoInputs;
  copy_if(Mask, std::back_inserter(LoInputs),
          [](int M) { return M >= 0 && M < 8; });
  array_pod_sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                 LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
  array_pod_sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                 HiInputs.end());

  bool TargetLo = LoInputs.size() >= HiInputs.size();
  ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
  ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

  int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
  SmallDenseMap<int, int, 8> LaneMap;
  for (int I : InPlaceInputs) {
    PreDupI16Shuffle[I / 2] = I / 2;
    LaneMap[I] = I;
  }
  int j = TargetLo ? 0 : 4, je = j + 4;
  for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
    // Check if j is already a shuffle of this input. This happens when
    // there are two adjacent bytes after we move the low one.
    if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
      // If we haven't yet mapped the input, search for a slot into which
      // we can map it.
      while (j < je && PreDupI16Shuffle[j] >= 0)
        ++j;

      if (j == je)
        // We can't place the inputs into a single half with a simple i16
        // shuffle, so bail.
        return SDValue();

      // Map this input with the i16 shuffle.
      PreDupI16Shuffle[j] = MovingInputs[i] / 2;
    }

    // Update the lane map based on the mapping we ended up with.
    LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
  }
  V1 = DAG.getBitcast(
      MVT::v16i8,
      DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                           DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

  // Unpack the bytes to form the i16s that will be shuffled into place.
  V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                   MVT::v16i8, V1, V1);

  int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0) {
      int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
      assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
      if (PostDupI16Shuffle[i / 2] < 0)
        PostDupI16Shuffle[i / 2] = MappedMask;
      else
        assert(PostDupI16Shuffle[i / 2] == MappedMask &&
               "Conflicting entries in the original shuffle!");
    }
  return DAG.getBitcast(
      MVT::v16i8,
      DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                           DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
};
if (SDValue V = tryToWidenViaDuplication())
  return V;
}
if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                 Zeroable, DAG))
  return Masked;

// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
        lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
  return V;
12734 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12735 // with PSHUFB. It is important to do this before we attempt to generate any
12736 // blends but after all of the single-input lowerings. If the single input
12737 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12738 // want to preserve that and we can DAG combine any longer sequences into
12739 // a PSHUFB in the end. But once we start blending from multiple inputs,
12740 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12741 // and there are *very* few patterns that would actually be faster than the
12742 // PSHUFB approach because of its ability to zero lanes.
12744 // FIXME: The only exceptions to the above are blends which are exact
12745 // interleavings with direct instructions supporting them. We currently don't
12746 // handle those well here.
12747 if (Subtarget.hasSSSE3()) {
12748 bool V1InUse = false;
12749 bool V2InUse = false;
12751 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12752 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12754 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12755 // do so. This avoids using them to handle blends-with-zero which is
12756 // important as a single pshufb is significantly faster for that.
12757 if (V1InUse && V2InUse) {
12758 if (Subtarget.hasSSE41())
12759 if (SDValue Blend = lowerVectorShuffleAsBlend(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
        return Blend;

  // We can use an unpack to do the blending rather than an or in some
  // cases. Even though the or may be (very minorly) more efficient, we
  // prefer this lowering because there are common cases where part of
  // the complexity of the shuffles goes away when we do the final blend as
  // an unpack.
  // FIXME: It might be worth trying to detect if the unpack-feeding
  // shuffles will both be pshufb, in which case we shouldn't bother with
  // this.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
          DL, MVT::v16i8, V1, V2, Mask, DAG))
    return Unpack;

  // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
}

return PSHUFB;
}
12783 // There are special ways we can lower some single-element blends.
12784 if (NumV2Elements == 1)
12785 if (SDValue V = lowerVectorShuffleAsElementInsertion(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return V;

if (SDValue BitBlend =
        lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
  return BitBlend;

// Check whether a compaction lowering can be done. This handles shuffles
// which take every Nth element for some even N. See the helper function for
// details.
//
// We special case these as they can be particularly efficiently handled with
// the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
12800 bool IsSingleInput = V2.isUndef();
12801 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12802 // NumEvenDrops is the power of two stride of the elements. Another way of
12803 // thinking about it is that we need to drop the even elements this many
12804 // times to get the original input.
12806 // First we need to zero all the dropped bytes.
12807 assert(NumEvenDrops <= 3 &&
12808 "No support for dropping even elements more than 3 times.");
12809 // We use the mask type to pick which bytes are preserved based on how many
12810 // elements are dropped.
12811 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12812 SDValue ByteClearMask = DAG.getBitcast(
12813 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12814 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12815 if (!IsSingleInput)
12816 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12818 // Now pack things back together.
12819 V1 = DAG.getBitcast(MVT::v8i16, V1);
12820 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12821 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12822 for (int i = 1; i < NumEvenDrops; ++i) {
12823 Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}

return Result;
}
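// Worked example (illustrative): with NumEvenDrops == 2 (a mask taking every
// 4th byte, e.g. <0, 4, 8, 12, ...>), the clear mask is a v4i32 splat of 0xFF
// so only byte 0 of each dword survives the AND; the first PACKUS then packs
// the zero-extended i16 elements into bytes, and the second PACKUS repeats
// the compaction, leaving the surviving bytes contiguous.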
12830 // Handle multi-input cases by blending single-input shuffles.
12831 if (NumV2Elements > 0)
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                  Mask, DAG);
12835 // The fallback path for single-input shuffles widens this into two v8i16
// vectors with unpacks, shuffles those, and then pulls them back together
// with a pack.
SDValue V = V1;
12840 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12841 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
for (int i = 0; i < 16; ++i)
  if (Mask[i] >= 0)
    (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12846 SDValue VLoHalf, VHiHalf;
12847 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
// them out and avoid using UNPCK{L,H} to extract the elements of V as
// i16s.
12850 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12851 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12852 // Use a mask to drop the high bytes.
12853 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12854 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12855 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12857 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12858 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12860 // Squash the masks to point directly into VLoHalf.
for (int &M : LoBlendMask)
  if (M >= 0)
    M /= 2;
for (int &M : HiBlendMask)
  if (M >= 0)
    M /= 2;
} else {
12868 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12869 // VHiHalf so that we can blend them as i16s.
12870 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12872 VLoHalf = DAG.getBitcast(
12873 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12874 VHiHalf = DAG.getBitcast(
MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
}
12878 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12879 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
12884 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
12886 /// This routine breaks down the specific type of 128-bit shuffle and
12887 /// dispatches to the lowering routines accordingly.
12888 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12889 MVT VT, SDValue V1, SDValue V2,
12890 const APInt &Zeroable,
12891 const X86Subtarget &Subtarget,
12892 SelectionDAG &DAG) {
switch (VT.SimpleTy) {
case MVT::v2i64:
  return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
  return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
  return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
  return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
  return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
  return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

default:
  llvm_unreachable("Unimplemented!");
}
}
12912 /// Generic routine to split vector shuffle into half-sized shuffles.
12914 /// This routine just extracts two subvectors, shuffles them independently, and
12915 /// then concatenates them back together. This should work effectively with all
12916 /// AVX vector shuffle types.
12917 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12918 SDValue V2, ArrayRef<int> Mask,
12919 SelectionDAG &DAG) {
12920 assert(VT.getSizeInBits() >= 256 &&
12921 "Only for 256-bit or wider vector shuffles!");
12922 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12923 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12925 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12926 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12928 int NumElements = VT.getVectorNumElements();
12929 int SplitNumElements = NumElements / 2;
12930 MVT ScalarVT = VT.getVectorElementType();
12931 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12933 // Rather than splitting build-vectors, just build two narrower build
12934 // vectors. This helps shuffling with splats and zeros.
12935 auto SplitVector = [&](SDValue V) {
12936 V = peekThroughBitcasts(V);
12938 MVT OrigVT = V.getSimpleValueType();
12939 int OrigNumElements = OrigVT.getVectorNumElements();
12940 int OrigSplitNumElements = OrigNumElements / 2;
12941 MVT OrigScalarVT = OrigVT.getVectorElementType();
MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

SDValue LoV, HiV;
auto *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV) {
  LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                    DAG.getIntPtrConstant(0, DL));
  HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                    DAG.getIntPtrConstant(OrigSplitNumElements, DL));
} else {
  SmallVector<SDValue, 16> LoOps, HiOps;
  for (int i = 0; i < OrigSplitNumElements; ++i) {
    LoOps.push_back(BV->getOperand(i));
    HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
  }
  LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
  HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
}
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                      DAG.getBitcast(SplitVT, HiV));
};
12966 SDValue LoV1, HiV1, LoV2, HiV2;
12967 std::tie(LoV1, HiV1) = SplitVector(V1);
12968 std::tie(LoV2, HiV2) = SplitVector(V2);
12970 // Now create two 4-way blends of these half-width vectors.
12971 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12972 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12973 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12974 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12975 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
for (int i = 0; i < SplitNumElements; ++i) {
  int M = HalfMask[i];
  if (M >= NumElements) {
    if (M >= NumElements + SplitNumElements)
      UseHiV2 = true;
    else
      UseLoV2 = true;
    V2BlendMask[i] = M - NumElements;
    BlendMask[i] = SplitNumElements + i;
  } else if (M >= 0) {
    if (M >= SplitNumElements)
      UseHiV1 = true;
    else
      UseLoV1 = true;
    V1BlendMask[i] = M;
    BlendMask[i] = i;
  }
}
12995 // Because the lowering happens after all combining takes place, we need to
12996 // manually combine these blend masks as much as possible so that we create
12997 // a minimal number of high-level vector shuffle nodes.
12999 // First try just blending the halves of V1 or V2.
13000 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
13001 return DAG.getUNDEF(SplitVT);
13002 if (!UseLoV2 && !UseHiV2)
13003 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13004 if (!UseLoV1 && !UseHiV1)
13005 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
  V1Blend =
      DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
  // We only use half of V1 so map the usage down into the final blend mask.
  V1Blend = UseLoV1 ? LoV1 : HiV1;
  for (int i = 0; i < SplitNumElements; ++i)
    if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
      BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
  V2Blend =
      DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
  // We only use half of V2 so map the usage down into the final blend mask.
  V2Blend = UseLoV2 ? LoV2 : HiV2;
  for (int i = 0; i < SplitNumElements; ++i)
    if (BlendMask[i] >= SplitNumElements)
      BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
13030 SDValue Lo = HalfBlend(LoMask);
13031 SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
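// Usage sketch (illustrative): a v8f32 shuffle with mask
// <0, 8, 1, 9, 4, 12, 5, 13> splits into two v4f32 half shuffles that both
// use the mask <0, 4, 1, 5> - one over the low 128-bit halves of V1/V2 and
// one over the high halves - and the two results are rejoined with
// CONCAT_VECTORS.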
/// Either split a vector in halves or decompose the shuffles and the
/// blended halves.
13038 /// This is provided as a good fallback for many lowerings of non-single-input
13039 /// shuffles with more than one 128-bit lane. In those cases, we want to select
13040 /// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
13043 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
13044 SDValue V1, SDValue V2,
13045 ArrayRef<int> Mask,
13046 SelectionDAG &DAG) {
13047 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13048 "shuffles as it could then recurse on itself.");
13049 int Size = Mask.size();
13051 // If this can be modeled as a broadcast of two elements followed by a blend,
13052 // prefer that lowering. This is especially important because broadcasts can
13053 // often fold with memory operands.
13054 auto DoBothBroadcast = [&] {
int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
for (int M : Mask)
  if (M >= Size) {
    if (V2BroadcastIdx < 0)
      V2BroadcastIdx = M - Size;
    else if (M - Size != V2BroadcastIdx)
      return false;
  } else if (M >= 0) {
    if (V1BroadcastIdx < 0)
      V1BroadcastIdx = M;
    else if (M != V1BroadcastIdx)
      return false;
  }
return true;
};
if (DoBothBroadcast())
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                    DAG);
13074 // If the inputs all stem from a single 128-bit lane of each input, then we
13075 // split them rather than blending because the split will decompose to
13076 // unusually few instructions.
13077 int LaneCount = VT.getSizeInBits() / 128;
13078 int LaneSize = Size / LaneCount;
13079 SmallBitVector LaneInputs[2];
13080 LaneInputs[0].resize(LaneCount, false);
13081 LaneInputs[1].resize(LaneCount, false);
for (int i = 0; i < Size; ++i)
  if (Mask[i] >= 0)
    LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13085 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13086 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13088 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13089 // that the decomposed single-input shuffles don't end up here.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
13093 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13094 /// a permutation and blend of those lanes.
13096 /// This essentially blends the out-of-lane inputs to each lane into the lane
13097 /// from a permuted copy of the vector. This lowering strategy results in four
13098 /// instructions in the worst case for a single-input cross lane shuffle which
13099 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13100 /// of. Special cases for each particular shuffle pattern should be handled
13101 /// prior to trying this lowering.
13102 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13103 SDValue V1, SDValue V2,
ArrayRef<int> Mask,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
13107 // FIXME: This should probably be generalized for 512-bit vectors as well.
13108 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13109 int Size = Mask.size();
13110 int LaneSize = Size / 2;
13112 // If there are only inputs from one 128-bit lane, splitting will in fact be
13113 // less expensive. The flags track whether the given lane contains an element
13114 // that crosses to another lane.
13115 if (!Subtarget.hasAVX2()) {
13116 bool LaneCrossing[2] = {false, false};
13117 for (int i = 0; i < Size; ++i)
13118 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13119 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13120 if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
} else {
  bool LaneUsed[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneUsed[(Mask[i] / LaneSize)] = true;
  if (!LaneUsed[0] || !LaneUsed[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
}
13131 assert(V2.isUndef() &&
13132 "This last part of this routine only works on single input shuffles");
13134 SmallVector<int, 32> FlippedBlendMask(Size);
13135 for (int i = 0; i < Size; ++i)
13136 FlippedBlendMask[i] =
Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                        ? Mask[i]
                        : Mask[i] % LaneSize +
                              (i / LaneSize) * LaneSize + Size);
13142 // Flip the vector, and blend the results which should now be in-lane.
13143 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13144 SDValue Flipped = DAG.getBitcast(PVT, V1);
Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
                               {2, 3, 0, 1});
13147 Flipped = DAG.getBitcast(VT, Flipped);
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
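// Illustrative walk-through: for a single-input v4f64 shuffle with mask
// <3, 1, 1, 3>, elements 0 and 2 need data from the other 128-bit lane, so
// the halves are flipped with the <2, 3, 0, 1> permute (one VPERM2F128) and
// FlippedBlendMask becomes <5, 1, 7, 3>: in-lane elements keep reading V1
// while the lane-crossing elements read the flipped copy.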
13151 /// Handle lowering 2-lane 128-bit shuffles.
13152 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13153 SDValue V2, ArrayRef<int> Mask,
13154 const APInt &Zeroable,
13155 const X86Subtarget &Subtarget,
13156 SelectionDAG &DAG) {
13157 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
  return SDValue();

SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
  return SDValue();
13165 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13166 bool IsHighZero = (Zeroable & 0xc) == 0xc;
13168 // Try to use an insert into a zero vector.
13169 if (WidenedMask[0] == 0 && IsHighZero) {
13170 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13171 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13172 DAG.getIntPtrConstant(0, DL));
13173 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13174 getZeroVector(VT, Subtarget, DAG, DL), LoV,
DAG.getIntPtrConstant(0, DL));
}
13178 // TODO: If minimizing size and one of the inputs is a zero vector and the
// zero vector has only one use, we could use a VPERM2X128 to save the
13180 // instruction bytes needed to explicitly generate the zero vector.
13182 // Blends are faster and handle all the non-lane-crossing cases.
13183 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
13184 Zeroable, Subtarget, DAG))
13187 // If either input operand is a zero vector, use VPERM2X128 because its mask
13188 // allows us to replace the zero input with an implicit zero.
13189 if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
13192 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13193 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13195 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13196 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13197 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13198 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13199 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13200 OnlyUsesV1 ? V1 : V2,
13201 DAG.getIntPtrConstant(0, DL));
13202 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(2, DL));
  }
}
13207 // Try to use SHUF128 if possible.
13208 if (Subtarget.hasVLX()) {
13209 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13210 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13211 ((WidenedMask[1] % 2) << 1);
13212 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
    }
  }
}
13218 // Otherwise form a 128-bit permutation. After accounting for undefs,
13219 // convert the 64-bit shuffle mask selection values into 128-bit
13220 // selection bits by dividing the indexes by 2 and shifting into positions
13221 // defined by a vperm2*128 instruction's immediate control byte.
13223 // The immediate permute control byte looks like this:
//    [1:0] - select 128 bits from sources for low half of destination
//    [2]   - ignore
//    [3]   - zero low half of destination
//    [5:4] - select 128 bits from sources for high half of destination
//    [6]   - ignore
13229 // [7] - zero high half of destination
13231 assert((WidenedMask[0] >= 0 || IsLowZero) &&
13232 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
13234 unsigned PermMask = 0;
13235 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13236 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
13238 // Check the immediate mask and replace unused sources with undef.
13239 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13240 V1 = DAG.getUNDEF(VT);
13241 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13242 V2 = DAG.getUNDEF(VT);
13244 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
}
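// Worked example (illustrative): for a v4f64 shuffle whose widened mask is
// <1, 2> (V1's high 128 bits, then V2's low 128 bits) with nothing zeroable,
// PermMask = (1 << 0) | (2 << 4) = 0x21, the familiar vperm2f128 $0x21
// "concatenate and shift by one lane" pattern.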
13248 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
13249 /// shuffling each lane.
13251 /// This will only succeed when the result of fixing the 128-bit lanes results
13252 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
13254 /// the lane crosses early and then use simpler shuffles within each lane.
13256 /// FIXME: It might be worthwhile at some point to support this without
13257 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13258 /// in x86 only floating point has interesting non-repeating shuffles, and even
13259 /// those are still *marginally* more expensive.
13260 static SDValue lowerVectorShuffleByMerging128BitLanes(
13261 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13262 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13263 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13265 int Size = Mask.size();
13266 int LaneSize = 128 / VT.getScalarSizeInBits();
13267 int NumLanes = Size / LaneSize;
13268 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13270 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13271 // check whether the in-128-bit lane shuffles share a repeating pattern.
13272 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13273 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
for (int i = 0; i < Size; ++i) {
  if (Mask[i] < 0)
    continue;

  int j = i / LaneSize;

  if (Lanes[j] < 0) {
    // First entry we've seen for this lane.
    Lanes[j] = Mask[i] / LaneSize;
  } else if (Lanes[j] != Mask[i] / LaneSize) {
    // This doesn't match the lane selected previously!
    return SDValue();
  }

  // Check that within each lane we have a consistent shuffle mask.
  int k = i % LaneSize;
  if (InLaneMask[k] < 0) {
    InLaneMask[k] = Mask[i] % LaneSize;
  } else if (InLaneMask[k] != Mask[i] % LaneSize) {
    // This doesn't fit a repeating in-lane mask.
    return SDValue();
  }
}
13298 // First shuffle the lanes into place.
13299 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13300 VT.getSizeInBits() / 64);
13301 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13302 for (int i = 0; i < NumLanes; ++i)
13303 if (Lanes[i] >= 0) {
13304 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
}
13308 V1 = DAG.getBitcast(LaneVT, V1);
13309 V2 = DAG.getBitcast(LaneVT, V2);
13310 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13312 // Cast it back to the type we actually want.
13313 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13315 // Now do a simple shuffle that isn't lane crossing.
13316 SmallVector<int, 8> NewMask((unsigned)Size, -1);
for (int i = 0; i < Size; ++i)
  if (Mask[i] >= 0)
    NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13320 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13321 "Must not introduce lane crosses at this point!");
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
13326 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13327 /// This allows for fast cases such as subvector extraction/insertion
13328 /// or shuffling smaller vector types which can lower more efficiently.
13329 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13330 SDValue V1, SDValue V2,
13331 ArrayRef<int> Mask,
13332 const X86Subtarget &Subtarget,
13333 SelectionDAG &DAG) {
13334 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13335 "Expected 256-bit or 512-bit vector");
13337 unsigned NumElts = VT.getVectorNumElements();
13338 unsigned HalfNumElts = NumElts / 2;
13339 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13341 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13342 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
if (!UndefLower && !UndefUpper)
  return SDValue();

// Upper half is undef and lower half is whole upper subvector.
// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (UndefUpper &&
    isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13350 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13351 DAG.getIntPtrConstant(HalfNumElts, DL));
13352 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(0, DL));
}

// Lower half is undef and upper half is whole lower subvector.
// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
    isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13360 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13361 DAG.getIntPtrConstant(0, DL));
13362 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
DAG.getIntPtrConstant(HalfNumElts, DL));
}
13366 // If the shuffle only uses two of the four halves of the input operands,
13367 // then extract them and perform the 'half' shuffle at half width.
13368 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13369 int HalfIdx1 = -1, HalfIdx2 = -1;
13370 SmallVector<int, 8> HalfMask(HalfNumElts);
13371 unsigned Offset = UndefLower ? HalfNumElts : 0;
13372 for (unsigned i = 0; i != HalfNumElts; ++i) {
int M = Mask[i + Offset];
if (M < 0)
  continue;
13379 // Determine which of the 4 half vectors this element is from.
13380 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13381 int HalfIdx = M / HalfNumElts;
13383 // Determine the element index into its half vector source.
13384 int HalfElt = M % HalfNumElts;
13386 // We can shuffle with up to 2 half vectors, set the new 'half'
13387 // shuffle mask accordingly.
13388 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13389 HalfMask[i] = HalfElt;
HalfIdx1 = HalfIdx;
  continue;
}
if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
  HalfMask[i] = HalfElt + HalfNumElts;
  HalfIdx2 = HalfIdx;
  continue;
}

// Too many half vectors referenced.
return SDValue();
}
13402 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13404 // Only shuffle the halves of the inputs when useful.
13405 int NumLowerHalves =
13406 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13407 int NumUpperHalves =
13408 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13410 // uuuuXXXX - don't extract uppers just to insert again.
if (UndefLower && NumUpperHalves != 0)
  return SDValue();

// XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
if (UndefUpper && NumUpperHalves == 2)
  return SDValue();

// AVX2 - XXXXuuuu - always extract lowers.
if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
  // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
  if (VT == MVT::v4f64 || VT == MVT::v4i64)
    return SDValue();
13423 // AVX2 supports variable 32-bit element cross-lane shuffles.
13424 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13425 // XXXXuuuu - don't extract lowers and uppers.
if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
      return SDValue();
  }
}

// AVX512 - XXXXuuuu - always extract lowers.
if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
  return SDValue();
13435 auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
  return DAG.getUNDEF(HalfVT);
13438 SDValue V = (HalfIdx < 2 ? V1 : V2);
13439 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13440 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
DAG.getIntPtrConstant(HalfIdx, DL));
};
13444 SDValue Half1 = GetHalfVector(HalfIdx1);
13445 SDValue Half2 = GetHalfVector(HalfIdx2);
13446 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13447 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
DAG.getIntPtrConstant(Offset, DL));
}
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
13454 /// This returns true if the elements from a particular input are already in the
13455 /// slot required by the given mask and require no permutation.
13456 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13457 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13458 int Size = Mask.size();
13459 for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
    return false;

return true;
}
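// Example (illustrative): for the v4 blend mask <0, 5, 2, 7>, elements 0 and
// 2 already sit at indices 0 and 2 of input 0, and elements 5 and 7 sit at
// indices 1 and 3 of input 1, so this returns true for both Input == 0 and
// Input == 1.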
13466 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13467 /// every lane can be represented as the same repeating mask - allowing us to
13468 /// shuffle the sources with the repeating shuffle and then permute the result
13469 /// to the destination lanes.
13470 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13471 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13472 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13473 int NumElts = VT.getVectorNumElements();
13474 int NumLanes = VT.getSizeInBits() / 128;
13475 int NumLaneElts = NumElts / NumLanes;
13477 // On AVX2 we may be able to just shuffle the lowest elements and then
13478 // broadcast the result.
13479 if (Subtarget.hasAVX2()) {
13480 for (unsigned BroadcastSize : {16, 32, 64}) {
if (BroadcastSize <= VT.getScalarSizeInBits())
  continue;
13483 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13485 // Attempt to match a repeating pattern every NumBroadcastElts,
13486 // accounting for UNDEFs but only references the lowest 128-bit
13487 // lane of the inputs.
13488 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13489 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13490 for (int j = 0; j != NumBroadcastElts; ++j) {
int M = Mask[i + j];
if (M < 0)
  continue;
int &R = RepeatMask[j];
if (0 != ((M % NumElts) / NumLaneElts))
  return false;
if (0 <= R && R != M)
  return false;
R = M;
}
return true;
};

SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
if (!FindRepeatingBroadcastMask(RepeatMask))
  continue;
13508 // Shuffle the (lowest) repeated elements in place for broadcast.
13509 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13511 // Shuffle the actual broadcast.
13512 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13513 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13514 for (int j = 0; j != NumBroadcastElts; ++j)
13515 BroadcastMask[i + j] = j;
return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                            BroadcastMask);
    }
  }

// Bail if the shuffle mask doesn't cross 128-bit lanes.
if (!is128BitLaneCrossingShuffleMask(VT, Mask))
  return SDValue();

// Bail if we already have a repeated lane shuffle mask.
SmallVector<int, 8> RepeatedShuffleMask;
if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
  return SDValue();
13530 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13531 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13532 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13533 int NumSubLanes = NumLanes * SubLaneScale;
13534 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13536 // Check that all the sources are coming from the same lane and see if we can
13537 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13538 // determine the source sub-lane for each destination sub-lane.
13539 int TopSrcSubLane = -1;
13540 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13541 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13542 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13543 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13545 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13546 // Extract the sub-lane mask, check that it all comes from the same lane
13547 // and normalize the mask entries to come from the first lane.
int SrcLane = -1;
SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13550 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
  continue;
int Lane = (M % NumElts) / NumLaneElts;
if ((0 <= SrcLane) && (SrcLane != Lane))
  return SDValue();
SrcLane = Lane;
int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
SubLaneMask[Elt] = LocalM;
}

// Whole sub-lane is UNDEF.
if (SrcLane < 0)
  continue;
13566 // Attempt to match against the candidate repeated sub-lane masks.
13567 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13568 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13569 for (int i = 0; i != NumSubLaneElts; ++i) {
if (M1[i] < 0 || M2[i] < 0)
    continue;
  if (M1[i] != M2[i])
    return false;
}
return true;
};
auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
  continue;
13582 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13583 for (int i = 0; i != NumSubLaneElts; ++i) {
int M = SubLaneMask[i];
if (M < 0)
  continue;
13587 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13588 "Unexpected mask element");
RepeatedSubLaneMask[i] = M;
}
13592 // Track the top most source sub-lane - by setting the remaining to UNDEF
13593 // we can greatly simplify shuffle matching.
13594 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13595 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
13600 // Bail if we failed to find a matching repeated sub-lane mask.
if (Dst2SrcSubLanes[DstSubLane] < 0)
  return SDValue();
}
13604 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13605 "Unexpected source lane");
13607 // Create a repeating shuffle mask for the entire vector.
13608 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13609 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13610 int Lane = SubLane / SubLaneScale;
13611 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13612 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
int M = RepeatedSubLaneMask[Elt];
if (M < 0)
  continue;
13616 int Idx = (SubLane * NumSubLaneElts) + Elt;
RepeatedMask[Idx] = M + (Lane * NumLaneElts);
  }
}
13620 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13622 // Shuffle each source sub-lane to its destination.
13623 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13624 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13625 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
if (SrcSubLane < 0)
  continue;
for (int j = 0; j != NumSubLaneElts; ++j)
  SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}

return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
                            SubLaneMask);
}
13636 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13637 unsigned &ShuffleImm,
13638 ArrayRef<int> Mask) {
13639 int NumElts = VT.getVectorNumElements();
13640 assert(VT.getScalarSizeInBits() == 64 &&
13641 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13642 "Unexpected data type for VSHUFPD");
13644 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
// Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
ShuffleImm = 0;
13647 bool ShufpdMask = true;
13648 bool CommutableMask = true;
13649 for (int i = 0; i < NumElts; ++i) {
if (Mask[i] == SM_SentinelUndef)
  continue;
13654 int Val = (i & 6) + NumElts * (i & 1);
13655 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13656 if (Mask[i] < Val || Mask[i] > Val + 1)
13657 ShufpdMask = false;
13658 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13659 CommutableMask = false;
ShuffleImm |= (Mask[i] % 2) << i;
}

if (ShufpdMask)
  return true;
if (CommutableMask) {
  std::swap(V1, V2);
  return true;
}

return false;
}
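// Worked example (illustrative): the v4f64 mask <0, 5, 2, 7> fits SHUFPD
// directly: the allowed pairs per position are {0,1}, {4,5}, {2,3} and
// {6,7}, and each element's odd/even choice sets bit i, giving
// ShuffleImm = 0b1010 = 0xA, i.e. dst = { V1[0], V2[1], V1[2], V2[3] }.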
13673 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13674 ArrayRef<int> Mask, SDValue V1,
13675 SDValue V2, SelectionDAG &DAG) {
13676 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13677 "Unexpected data type for VSHUFPD");
13679 unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
  return SDValue();

return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                   DAG.getConstant(Immediate, DL, MVT::i8));
}
13687 /// Handle lowering of 4-lane 64-bit floating point shuffles.
13689 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13690 /// isn't available.
13691 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13692 const APInt &Zeroable,
13693 SDValue V1, SDValue V2,
13694 const X86Subtarget &Subtarget,
13695 SelectionDAG &DAG) {
13696 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13697 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13698 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13700 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
  return V;
13704 if (V2.isUndef()) {
13705 // Check for being able to broadcast a single element.
13706 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;
13710 // Use low duplicate instructions for masks that match their pattern.
13711 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13712 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13714 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13715 // Non-half-crossing single input shuffles can be lowered with an
13716 // interleaved permutation.
13717 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13718 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13719 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
DAG.getConstant(VPERMILPMask, DL, MVT::i8));
}
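// Worked example (illustrative): for the in-lane v4f64 mask <1, 0, 3, 2> the
// immediate is (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2)
// | ((Mask[3] == 3) << 3) = 0b0101, so VPERMILPD $5 swaps the two doubles
// within each 128-bit lane.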
13723 // With AVX2 we have direct support for this permutation.
13724 if (Subtarget.hasAVX2())
13725 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13726 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13728 // Try to create an in-lane repeating shuffle mask and then shuffle the
13729 // results into the target lanes.
13730 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;
13734 // Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                               DAG, Subtarget);
}

// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
        lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
  return V;
13744 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
  return Blend;

// Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op =
        lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
  return Op;
13753 // Try to create an in-lane repeating shuffle mask and then shuffle the
13754 // results into the target lanes.
13755 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
  return V;
13759 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13760 // shuffle. However, if we have AVX2 and either inputs are already in place,
13761 // we will be able to shuffle even across lanes the other input in a single
13762 // instruction so skip this pattern.
13763 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13764 isShuffleMaskInputInPlace(1, Mask))))
13765 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return Result;
13768 // If we have VLX support, we can use VEXPAND.
13769 if (Subtarget.hasVLX())
13770 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
V1, V2, DAG, Subtarget))
    return V;

// If we have AVX2 then we always want to lower with a blend because at v4 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
                                                    Mask, DAG);

// Otherwise fall back on generic lowering.
return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
13784 /// Handle lowering of 4-lane 64-bit integer shuffles.
13786 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
13788 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13789 const APInt &Zeroable,
13790 SDValue V1, SDValue V2,
13791 const X86Subtarget &Subtarget,
13792 SelectionDAG &DAG) {
13793 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13794 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13795 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13796 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13798 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
  return V;
13802 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
  return Blend;
13806 // Check for being able to broadcast a single element.
13807 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
  return Broadcast;
13811 if (V2.isUndef()) {
13812 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13813 // can use lower latency instructions that will operate on both lanes.
13814 SmallVector<int, 2> RepeatedMask;
13815 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13816 SmallVector<int, 4> PSHUFDMask;
13817 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13818 return DAG.getBitcast(
13820 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13821 DAG.getBitcast(MVT::v8i32, V1),
getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
}

// AVX2 provides a direct instruction for permuting a single input across
// lanes.
return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
13831 // Try to use shift instructions.
13832 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
  return Shift;
13836 // If we have VLX support, we can use VALIGN or VEXPAND.
13837 if (Subtarget.hasVLX()) {
13838 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;
}
13847 // Try to use PALIGNR.
13848 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
Mask, Subtarget, DAG))
  return Rotate;

// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
        lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
  return V;
13857 // Try to create an in-lane repeating shuffle mask and then shuffle the
13858 // results into the target lanes.
13859 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
  return V;
13863 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13864 // shuffle. However, if we have AVX2 and either inputs are already in place,
13865 // we will be able to shuffle even across lanes the other input in a single
13866 // instruction so skip this pattern.
13867 if (!isShuffleMaskInputInPlace(0, Mask) &&
13868 !isShuffleMaskInputInPlace(1, Mask))
13869 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return Result;

// Otherwise fall back on generic blend lowering.
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
                                                  Mask, DAG);
}
/// Handle lowering of 8-lane 32-bit floating point shuffles.
///
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 &&
           "Repeated masks must be half the mask width!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
      return V;

    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
    // have already handled any direct blends.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
  }

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If we have a single input shuffle with different shuffle patterns in the
  // two 128-bit lanes, use the variable mask form of VPERMILPS.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

    if (Subtarget.hasAVX2())
      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

    // Otherwise, fall back.
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2,
                                                   Mask, DAG, Subtarget);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // If we have VLX support, we can use VEXPAND.
  if (Subtarget.hasVLX())
    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;

  // For non-AVX512, if the mask is an in-lane 16-bit unpack pattern, try to
  // split the shuffle, since after splitting we get more efficient code using
  // vpunpcklwd and vpunpckhwd than with vblend.
  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
                                                     Mask, DAG))
      return V;

  // If we have AVX2 then we always want to lower with a blend because at v8 we
  // can fully permute the elements.
  if (Subtarget.hasAVX2())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
                                                      Mask, Subtarget, DAG);

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
}

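// Illustrative example (added commentary): the v8f32 mask <0,0,2,2,4,4,6,6>
// repeats as {0,0,2,2} within each 128-bit lane, so the even-duplicate check
// above matches and a single VMOVSLDUP is emitted; the odd counterpart
// <1,1,3,3,5,5,7,7> lowers to VMOVSHDUP the same way.
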
/// Handle lowering of 8-lane 32-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // For non-AVX512, if the mask is an in-lane 16-bit unpack pattern, try to
  // split the shuffle, since after splitting we get more efficient code than
  // vblend by using vpunpcklwd and vpunpckhwd.
  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
      !Subtarget.hasAVX512())
    if (SDValue V =
            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
      return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the two 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // If we have VLX support, we can use VALIGN or EXPAND.
  if (Subtarget.hasVLX()) {
    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
                                               V1, V2, DAG, Subtarget))
      return V;
  }

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v8i32, ShufPS);
  }

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic blend lowering.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
                                                    Mask, Subtarget, DAG);
}

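// Illustrative worked example (added commentary): the two-input v8i32 mask
// <0,2,8,10,4,6,12,14> repeats as {0,2,4,6} in each 128-bit lane and is a
// single-SHUFPS pattern, so both inputs are bitcast to v8f32 and one VSHUFPS
// with immediate 0|(2<<2)|(0<<4)|(2<<6) = 0x88 is emitted, trading a possible
// int/fp domain-crossing penalty for fewer uops.
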
/// Handle lowering of 16-lane 16-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (V2.isUndef()) {
    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                     Mask, DAG, Subtarget);

    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v16 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512BWVL can lower to VPERMW.
  if (Subtarget.hasBWI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
}

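// Illustrative example (added commentary): the single-input v16i16 mask
// <3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12> repeats as the v8i16 mask
// {3,2,1,0,7,6,5,4} in both 128-bit lanes, so it is forwarded to the general
// v8i16 lowering above, which emits PSHUFLW/PSHUFHW-style shuffles that
// operate on both lanes of the ymm register simultaneously.
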
/// Handle lowering of 32-lane 8-bit integer shuffles.
///
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
                                                        Mask, Subtarget, DAG))
    return Broadcast;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
                                             Subtarget))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                   DAG, Subtarget);

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // AVX512VBMIVL can lower to VPERMB.
  if (Subtarget.hasVBMI() && Subtarget.hasVLX())
    return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle.
  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
    return Result;

  // Otherwise fall back on generic lowering.
  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
}

/// High-level routine to lower various 256-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 256-bit x86 vector
/// shuffle or splits it into two 128-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = VT.getVectorNumElements();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
  // can check for those subtargets here and avoid much of the subtarget
  // querying in the per-vector-type lowering routines. With AVX1 we have
  // essentially *zero* ability to manipulate a 256-bit vector with integer
  // types. Since we'll use floating point types there eventually, just
  // immediately cast everything to a float and operate entirely in that domain.
  if (VT.isInteger() && !Subtarget.hasAVX2()) {
    int ElementBits = VT.getScalarSizeInBits();
    if (ElementBits < 32) {
      // No floating point type available, if we can't use the bit operations
      // for masking/blending then decompose into 128-bit vectors.
      if (SDValue V =
              lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
        return V;
      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
        return V;
      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
    }

    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
                                VT.getVectorNumElements());
    V1 = DAG.getBitcast(FpVT, V1);
    V2 = DAG.getBitcast(FpVT, V2);
    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
  }

  switch (VT.SimpleTy) {
  case MVT::v4f64:
    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i64:
    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f32:
    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i32:
    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i16:
    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i8:
    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 256-bit x86 vector type!");
  }
}

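// Illustrative example (added commentary; exact instruction choice depends on
// the mask): on an AVX1-only target a v4i64 shuffle is bitcast to v4f64 and
// re-lowered entirely in the floating point domain (e.g. via VSHUFPD or
// VPERM2F128), because AVX1 offers almost no 256-bit integer shuffles.
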
/// Try to lower a vector shuffle as 128-bit subvector shuffles.
static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128bit shuffle.");

  // Handling a 256-bit vector here would require VLX, and
  // lowerV2X128VectorShuffle() is most probably the better solution there.
  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

  // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // Try to use an insert into a zero vector.
  if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
      (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
    unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                              DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL), LoV,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Check for patterns which can be matched with a single insert of a 256-bit
  // subvector.
  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 0, 1, 2, 3});
  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                 OnlyUsesV1 ? V1 : V2,
                                 DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
                       DAG.getIntPtrConstant(4, DL));
  }

  assert(WidenedMask.size() == 4);

  // See if this is an insertion of the lower 128-bits of V2 into V1.
  bool IsInsert = true;
  int V2Index = -1;
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    // Make sure all V1 subvectors are in place.
    if (WidenedMask[i] < 4) {
      if (WidenedMask[i] != i) {
        IsInsert = false;
        break;
      }
    } else {
      // Make sure we only have a single V2 index and it's the lowest 128 bits.
      if (V2Index >= 0 || WidenedMask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

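// Illustrative worked example (added commentary): for a v8i64 shuffle whose
// widened 128-bit-chunk mask is <0,1,4,5>, chunks 0-1 come from V1 and chunks
// 2-3 from V2, so Ops = {V1, V2} and the immediate works out to
// (0<<0)|(1<<2)|(0<<4)|(1<<6) = 0x44, i.e. a single VSHUFI64X2 $0x44.
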
/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
                                   Subtarget, DAG))
    return Shuf128;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op =
          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
    return Op;

  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}

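// Illustrative worked example (added commentary): the single-input v8f64 mask
// <1,0,3,2,5,4,7,6> swaps the two elements inside every 128-bit lane, so the
// per-position tests above produce VPERMILPMask = 0b01010101 = 0x55 and the
// shuffle lowers to a single VPERMILPD $0x55 without crossing lanes.
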
/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // If the shuffle mask is repeated in each 128-bit lane, we have many more
  // options to efficiently lower the shuffle.
  SmallVector<int, 4> RepeatedMask;
  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

    if (V2.isUndef())
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue Unpck =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
      return Unpck;

    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Otherwise, fall back to a SHUFPS sequence.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes that doesn't cross lanes, use the variable mask VPERMILPS.
  if (V2.isUndef() &&
      !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
    SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
}

/// Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the unit, we
    // can use lower latency instructions that will operate on all four
    // 128-bit lanes.
    SmallVector<int, 2> Repeated128Mask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
      SmallVector<int, 4> PSHUFDMask;
      scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
      return DAG.getBitcast(
          MVT::v8i64,
          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
                      DAG.getBitcast(MVT::v16i32, V1),
                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    }

    SmallVector<int, 4> Repeated256Mask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
  }

  if (SDValue Shuf128 =
          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
                                   V1, V2, Subtarget, DAG))
    return Shuf128;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use PALIGNR.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue Unpck =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
    return Unpck;

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
                                             V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}

/// Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // If the shuffle mask is repeated in each 128-bit lane we can use more
  // efficient instructions that mirror the shuffles across the four 128-bit
  // lanes.
  SmallVector<int, 4> RepeatedMask;
  bool Is128BitLaneRepeatedShuffle =
      is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
  if (Is128BitLaneRepeatedShuffle) {
    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
    if (V2.isUndef())
      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
      return V;
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use VALIGN.
  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
                                                  Mask, Subtarget, DAG))
    return Rotate;

  // Try to use byte rotation instructions.
  if (Subtarget.hasBWI())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than using a permv shuffle.
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
    SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
    SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
    SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
                                                  CastV1, CastV2, DAG);
    return DAG.getBitcast(MVT::v16i32, ShufPS);
  }

  // If we have AVX512F support, we can use VEXPAND.
  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;
  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}

/// Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (V2.isUndef()) {
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
      // As this is a single-input shuffle, the repeated mask should be
      // a strictly valid v8i16 mask that we can pass through to the v8i16
      // lowering to handle even the v32 case.
      return lowerV8I16GeneralSingleInputVectorShuffle(
          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
    }
  }

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}

/// Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
    return V;

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
          DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
    return PSHUFB;

  // VBMI can use VPERMV/VPERMV3 byte shuffles.
  if (Subtarget.hasVBMI())
    return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  // FIXME: Implement direct support for this type!
  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}

/// High-level routine to lower various 512-bit x86 vector shuffles.
///
/// This routine either breaks down the specific type of a 512-bit x86 vector
/// shuffle or splits it into two 256-bit shuffles and fuses the results back
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  // If we have a single input to the zero element, insert that into V1 if we
  // can do so cheaply.
  int NumElts = Mask.size();
  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

  if (NumV2Elements == 1 && Mask[0] >= NumElts)
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;

  // Handle special cases where the lower or upper half is UNDEF.
  if (SDValue V =
          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Check for being able to broadcast a single element.
  if (SDValue Broadcast =
          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
    return Broadcast;

  // Dispatch to each element type for lowering. If we don't have support for
  // specific element type shuffles at 512 bits, immediately split them and
  // lower them. Each lowering routine of a given type is allowed to assume that
  // the requisite ISA extensions for that element type are available.
  switch (VT.SimpleTy) {
  case MVT::v8f64:
    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16f32:
    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i64:
    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i32:
    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v32i16:
    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v64i8:
    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Not a valid 512-bit x86 vector type!");
  }
}

// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                      MVT VT, SDValue V1, SDValue V2,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned NumElts = Mask.size();

  // Try to recognize shuffles that are just padding a subvector with zeros.
  unsigned SubvecElts = 0;
  for (int i = 0; i != (int)NumElts; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i)
      break;

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Clip to a power of 2.
  SubvecElts = PowerOf2Floor(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
                                  V1, DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL),
                       Extract, DAG.getIntPtrConstant(0, DL));
  }

  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");
  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // The i1 elements were sign-extended, so a signed compare against zero
  // recovers the mask bits.
  int NumElems = VT.getVectorNumElements();
  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
      (Subtarget.hasDQI() && (NumElems < 32)))
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
                        Shuffle, ISD::SETGT);

  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}

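// Illustrative example (added commentary; the chosen extension type varies
// with the subtarget as selected above): a v8i1 shuffle on a KNL-class target
// without VLX is sign-extended to v8i64, shuffled as an ordinary 512-bit
// vector, and converted back to a mask with a "0 > lane" compare: the
// extended lanes are 0 or -1, so the signed compare recovers the i1 bits.
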
/// Helper function that returns true if the shuffle mask should be
/// commuted to improve canonicalization.
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
  int NumElements = Mask.size();

  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;

  // Commute the shuffle as needed such that more elements come from V1 than
  // V2. This allows us to match the shuffle pattern strictly on how many
  // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;

  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}

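// Illustrative example (added commentary): for a v4i32 mask <4,5,6,1>, three
// elements come from V2 and one from V1, so the first rule fires and the
// shuffle is commuted to the equivalent <0,1,2,5> form, letting the lowering
// code match patterns purely on how many elements come from V1.
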
/// Top-level lowering for x86 vector shuffles.
///
/// This handles decomposition, canonicalization, and lowering of all x86
/// vector shuffles. Most of the specific lowering strategies are encapsulated
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  ArrayRef<int> Mask = SVOp->getMask();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
         "Can't lower MMX shuffles");

  bool V1IsUndef = V1.isUndef();
  bool V2IsUndef = V2.isUndef();
  if (V1IsUndef && V2IsUndef)
    return DAG.getUNDEF(VT);

  // When we create a shuffle node we put the UNDEF node as the second
  // operand, but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);

  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }

  // Check for illegal shuffle mask element index values.
  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
  assert(llvm::all_of(Mask,
                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
         "Out of bounds shuffle index");

  // We actually see shuffles that are entirely re-arrangements of a set of
  // zero inputs. This mostly happens while decomposing complex shuffles into
  // simple ones. Directly lower these as a buildvector of zeros.
  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
  if (Zeroable.isAllOnesValue())
    return getZeroVector(VT, Subtarget, DAG, DL);

  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

  // Create an alternative mask with info about zeroable elements.
  // Here we do not set undef elements as zeroable.
  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
  if (V2IsZero) {
    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
    for (int i = 0; i != NumElements; ++i)
      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
        ZeroableMask[i] = SM_SentinelZero;
  }

  // Try to collapse shuffles into using a vector type with fewer elements but
  // wider element types. We cap this to not form integers or floating point
  // elements wider than 64 bits, but it might be interesting to form i128
  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
  SmallVector<int, 16> WidenedMask;
  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
    MVT NewEltVT = VT.isFloatingPoint()
                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
    int NewNumElts = NumElements / 2;
    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      if (V2IsZero) {
        // Modify the new Mask to take all zeros from the all-zero vector.
        // Choose indices that are blend-friendly.
        bool UsedZeroVector = false;
        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
               "V2's non-undef elements are used?!");
        for (int i = 0; i != NewNumElts; ++i)
          if (WidenedMask[i] == SM_SentinelZero) {
            WidenedMask[i] = i + NewNumElts;
            UsedZeroVector = true;
          }
        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
        // some elements to be undef.
        if (UsedZeroVector)
          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
      }
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }

  // Commute the shuffle if it will improve canonicalization.
  if (canonicalizeShuffleMaskWithCommute(Mask))
    return DAG.getCommutedVectorShuffle(*SVOp);

  if (SDValue V =
          lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
    return V;

  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                  DAG);

  llvm_unreachable("Unimplemented!");
}

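// Illustrative example (added commentary): a v16i8 mask such as
// <2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13> moves adjacent byte pairs together,
// so canWidenShuffleElements succeeds and the node is re-emitted as the v8i16
// shuffle <1,0,3,2,5,4,7,6> on bitcast operands, where much cheaper
// word-level lowerings apply.
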
/// Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  SDValue Cond = Op.getOperand(0);
  SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

  // Only non-legal VSELECTs reach this lowering, convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
  SmallVector<int, 32> Mask;
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    int M = i;
    // We can't map undef to undef here. They have different meanings. Treat
    // undef the same as zero.
    if (CondElt.isUndef() || isNullConstant(CondElt))
      M += Size;
    Mask.push_back(M);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}

SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized into
  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against zero.
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
                                getZeroVector(VT, Subtarget, DAG, dl),
                                ISD::SETNE);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; but if we need to expand, return a null
  // SDValue.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16: {
    // Bitcast everything to the vXi8 type and use a vXi8 vselect.
    MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
    SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
    SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
    SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
    SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
    return DAG.getBitcast(VT, Select);
  }
  }
}

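// Illustrative example (added commentary): a v8i16 vselect with a variable
// condition has no dedicated 16-bit blend instruction on SSE4.1, so the case
// above bitcasts the condition and both operands to v16i8 and emits a v16i8
// VSELECT, which can then be matched to PBLENDVB.
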
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // EXTRACTPS/PEXTRQ work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}

/// Extract one bit from a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers, so extend the
  // vector to VR512/VR128.
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
    // than extending to 128/256 bits.
    MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
    MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If the kshift instructions of the correct width aren't natively supported
  // then we need to promote the vector to the native size to get the correct
  // zeroing behavior.
  if (VecVT.getVectorNumElements() < 16) {
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                      DAG.getUNDEF(VecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Extracts from element 0 are always allowed.
  if (IdxVal != 0) {
    // Use kshiftr instruction to move to the lower element.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(IdxVal, dl, MVT::i8));
  }

  // Shrink to v16i1 since that's always legal.
  if (VecVT.getVectorNumElements() > 16) {
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Convert to a bitcast+aext/trunc.
  MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
  return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
}

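// Illustrative example (added commentary): extracting bit 5 of a v8i1 mask is
// first widened to v16i1 so the 16-bit kshift exists and zeroes the
// shifted-in bits, then shifted right by 5 (KSHIFTRW $5), bitcast to i16, and
// truncated so the desired bit lands in bit 0 of the scalar result.
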
15332 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15333 SelectionDAG &DAG) const {
15335 SDValue Vec = Op.getOperand(0);
15336 MVT VecVT = Vec.getSimpleValueType();
15337 SDValue Idx = Op.getOperand(1);
15339 if (VecVT.getVectorElementType() == MVT::i1)
15340 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
15342 if (!isa<ConstantSDNode>(Idx)) {
15343 // Its more profitable to go through memory (1 cycles throughput)
15344 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
15345 // IACA tool was used to get performance estimation
15346 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
15348 // example : extractelement <16 x i8> %a, i32 %i
15350 // Block Throughput: 3.00 Cycles
15351 // Throughput Bottleneck: Port5
15353 // | Num Of | Ports pressure in cycles | |
15354 // | Uops | 0 - DV | 5 | 6 | 7 | |
15355 // ---------------------------------------------
15356 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
15357 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
15358 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
15359 // Total Num Of Uops: 4
15362 // Block Throughput: 1.00 Cycles
15363 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
15365 // | | Ports pressure in cycles | |
15366 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
15367 // ---------------------------------------------------------
15368 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
15369 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
15370 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
15371 // Total Num Of Uops: 4
15376 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15378 // If this is a 256-bit vector result, first extract the 128-bit vector and
15379 // then extract the element from the 128-bit vector.
15380 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15381 // Get the 128-bit vector.
15382 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15383 MVT EltVT = VecVT.getVectorElementType();
15385 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15386 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15388 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15389 // this can be done with a mask.
15390 IdxVal &= ElemsPerChunk - 1;
15391 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15392 DAG.getConstant(IdxVal, dl, MVT::i32));
15393 }
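// Worked example for the narrowing above: extractelement <8 x i32> %v, i32 5
// lives in 128-bit chunk 5 / 4 == 1 at local index 5 & 3 == 1, so it lowers
// to approximately
//   vextracti128 $1, %ymm0, %xmm0
//   vpextrd      $1, %xmm0, %eax
// (a sketch; the exact opcodes depend on element type and subtarget).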
15395 assert(VecVT.is128BitVector() && "Unexpected vector length");
15397 MVT VT = Op.getSimpleValueType();
15399 if (VT.getSizeInBits() == 16) {
15400 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15401 // we're going to zero extend the register or fold the store (SSE41 only).
15402 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15403 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15404 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15405 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15406 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15408 // Transform it so it matches pextrw, which produces a 32-bit result.
15409 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15410 Op.getOperand(0), Op.getOperand(1));
15411 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15414 if (Subtarget.hasSSE41())
15415 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15416 return Res;
15418 // TODO: We only extract a single element from v16i8, we can probably afford
15419 // to be more aggressive here before using the default approach of spilling to
15420 // stack.
15421 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15422 // Extract either the lowest i32 or any i16, and extract the sub-byte.
15423 int DWordIdx = IdxVal / 4;
15424 if (DWordIdx == 0) {
15425 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15426 DAG.getBitcast(MVT::v4i32, Vec),
15427 DAG.getIntPtrConstant(DWordIdx, dl));
15428 int ShiftVal = (IdxVal % 4) * 8;
15429 if (ShiftVal != 0)
15430 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15431 DAG.getConstant(ShiftVal, dl, MVT::i8));
15432 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15433 }
15435 int WordIdx = IdxVal / 2;
15436 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15437 DAG.getBitcast(MVT::v8i16, Vec),
15438 DAG.getIntPtrConstant(WordIdx, dl));
15439 int ShiftVal = (IdxVal % 2) * 8;
15440 if (ShiftVal != 0)
15441 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15442 DAG.getConstant(ShiftVal, dl, MVT::i8));
15443 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15444 }
15446 if (VT.getSizeInBits() == 32) {
15447 if (IdxVal == 0)
15448 return Op;
15450 // SHUFPS the element to the lowest double word, then movss.
15451 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15452 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15454 DAG.getIntPtrConstant(0, dl));
15457 if (VT.getSizeInBits() == 64) {
15458 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15459 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15460 // to match extract_elt for f64.
15461 if (IdxVal == 0)
15462 return Op;
15464 // UNPCKHPD the element to the lowest double word, then movsd.
15465 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
15466 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
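// For instance, "store double (extractelement <2 x double> %v, i32 1)" should
// collapse to a single "movhpd %xmm0, (%rdi)" (operands illustrative).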
15467 int Mask[2] = { 1, -1 };
15468 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15469 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15470 DAG.getIntPtrConstant(0, dl));
15471 }
15473 return SDValue();
15474 }
15476 /// Insert one bit into a mask vector, like v16i1 or v8i1.
15477 /// AVX-512 feature.
15478 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15479 const X86Subtarget &Subtarget) {
15481 SDValue Vec = Op.getOperand(0);
15482 SDValue Elt = Op.getOperand(1);
15483 SDValue Idx = Op.getOperand(2);
15484 MVT VecVT = Vec.getSimpleValueType();
15486 if (!isa<ConstantSDNode>(Idx)) {
15487 // Non-constant index. Extend source and destination,
15488 // insert element and then truncate the result.
15489 unsigned NumElts = VecVT.getVectorNumElements();
15490 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15491 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15492 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15493 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15494 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15495 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15498 // Copy into a k-register, extract to v1i1 and insert_subvector.
15499 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15501 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15502 Idx);
15503 }
15505 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15506 SelectionDAG &DAG) const {
15507 MVT VT = Op.getSimpleValueType();
15508 MVT EltVT = VT.getVectorElementType();
15509 unsigned NumElts = VT.getVectorNumElements();
15511 if (EltVT == MVT::i1)
15512 return InsertBitToMaskVector(Op, DAG, Subtarget);
15514 SDLoc dl(Op);
15515 SDValue N0 = Op.getOperand(0);
15516 SDValue N1 = Op.getOperand(1);
15517 SDValue N2 = Op.getOperand(2);
15518 if (!isa<ConstantSDNode>(N2))
15519 return SDValue();
15520 auto *N2C = cast<ConstantSDNode>(N2);
15521 unsigned IdxVal = N2C->getZExtValue();
15523 bool IsZeroElt = X86::isZeroNode(N1);
15524 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15526 // If we are inserting an element, see if we can do this more efficiently with
15527 // a blend shuffle with a rematerializable vector than with a costly integer
15528 // insertion.
15529 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15530 16 <= EltVT.getSizeInBits()) {
15531 SmallVector<int, 8> BlendMask;
15532 for (unsigned i = 0; i != NumElts; ++i)
15533 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15534 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15535 : getOnesVector(VT, DAG, dl);
15536 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15537 }
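// Example of the blend above (a sketch, not guaranteed output): inserting zero
// into element 2 of a v4i32 builds BlendMask <0, 1, 6, 3> against a zero
// vector, which SSE4.1 can select as a single "blendps $4, %xmm1, %xmm0".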
15539 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15540 // into that, and then insert the subvector back into the result.
15541 if (VT.is256BitVector() || VT.is512BitVector()) {
15542 // With a 256-bit vector, we can insert into the zero element efficiently
15543 // using a blend if we have AVX or AVX2 and the right data type.
15544 if (VT.is256BitVector() && IdxVal == 0) {
15545 // TODO: It is worthwhile to cast integer to floating point and back
15546 // and incur a domain crossing penalty if that's what we'll end up
15547 // doing anyway after extracting to a 128-bit vector.
15548 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15549 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15550 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15551 N2 = DAG.getIntPtrConstant(1, dl);
15552 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15553 }
15554 }
15556 // Get the desired 128-bit vector chunk.
15557 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15559 // Insert the element into the desired chunk.
15560 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15561 assert(isPowerOf2_32(NumEltsIn128));
15562 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
15563 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15565 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15566 DAG.getConstant(IdxIn128, dl, MVT::i32));
15568 // Insert the changed part back into the bigger vector
15569 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15570 }
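// Worked example: insertelement <8 x float> %v, float %s, i32 5 rewrites the
// upper chunk (IdxVal / 4 == 1) at local index 5 & 3 == 1, roughly
//   vextractf128 $1, %ymm0, %xmm1
//   vinsertps    $0x10, %xmm2, %xmm1, %xmm1
//   vinsertf128  $1, %xmm1, %ymm0, %ymm0
// (register assignment is illustrative only).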
15571 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15573 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
15574 // argument. SSE41 required for pinsrb.
15575 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15576 unsigned Opc;
15577 if (VT == MVT::v8i16) {
15578 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15579 Opc = X86ISD::PINSRW;
15580 } else {
15581 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15582 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15583 Opc = X86ISD::PINSRB;
15584 }
15586 if (N1.getValueType() != MVT::i32)
15587 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15588 if (N2.getValueType() != MVT::i32)
15589 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15590 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15593 if (Subtarget.hasSSE41()) {
15594 if (EltVT == MVT::f32) {
15595 // Bits [7:6] of the constant are the source select. This will always be
15596 // zero here. The DAG Combiner may combine an extract_elt index into
15597 // these bits. For example (insert (extract, 3), 2) could be matched by
15598 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15599 // Bits [5:4] of the constant are the destination select. This is the
15600 // value of the incoming immediate.
15601 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15602 // combine either bitwise AND or insert of float 0.0 to set these bits.
15604 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15605 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15606 // If this is an insertion of 32-bits into the low 32-bits of
15607 // a vector, we prefer to generate a blend with immediate rather
15608 // than an insertps. Blends are simpler operations in hardware and so
15609 // will always have equal or better performance than insertps.
15610 // But if optimizing for size and there's a load folding opportunity,
15611 // generate insertps because blendps does not have a 32-bit memory
15612 // operand form.
15613 N2 = DAG.getIntPtrConstant(1, dl);
15614 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15615 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15616 }
15617 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15618 // Create this as a scalar to vector.
15619 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15620 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15621 }
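// Immediate encoding example for the INSERTPS path: IdxVal == 2 gives
// IdxVal << 4 == 0x20, i.e. "insertps $0x20, %xmm1, %xmm0" with destination
// select bits [5:4] == 2 and a clear zero mask (a sketch for illustration).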
15623 // PINSR* works with constant index.
15624 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15625 return Op;
15626 }
15628 return SDValue();
15629 }
15631 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15632 SelectionDAG &DAG) {
15633 SDLoc dl(Op);
15634 MVT OpVT = Op.getSimpleValueType();
15636 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
15637 // further DAG combines.
15638 if (X86::isZeroNode(Op.getOperand(0)))
15639 return getZeroVector(OpVT, Subtarget, DAG, dl);
15641 // If this is a 256-bit vector result, first insert into a 128-bit
15642 // vector and then insert into the 256-bit vector.
15643 if (!OpVT.is128BitVector()) {
15644 // Insert into a 128-bit vector.
15645 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15646 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15647 OpVT.getVectorNumElements() / SizeFactor);
15649 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15651 // Insert the 128-bit vector.
15652 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15654 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15656 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15657 if (OpVT == MVT::v4i32)
15658 return Op;
15660 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15661 return DAG.getBitcast(
15662 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15665 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15666 // simple superregister reference or explicit instructions to insert
15667 // the upper bits of a vector.
15668 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15669 SelectionDAG &DAG) {
15670 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15672 return insert1BitVector(Op, DAG, Subtarget);
15675 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15676 SelectionDAG &DAG) {
15677 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15678 "Only vXi1 extract_subvectors need custom lowering");
15680 SDLoc dl(Op);
15681 SDValue Vec = Op.getOperand(0);
15682 SDValue Idx = Op.getOperand(1);
15684 if (!isa<ConstantSDNode>(Idx))
15685 return SDValue();
15687 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15688 if (IdxVal == 0) // the operation is legal
15689 return Op;
15691 MVT VecVT = Vec.getSimpleValueType();
15692 unsigned NumElems = VecVT.getVectorNumElements();
15694 // Extend to natively supported kshift.
15695 MVT WideVecVT = VecVT;
15696 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15697 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15698 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15699 DAG.getUNDEF(WideVecVT), Vec,
15700 DAG.getIntPtrConstant(0, dl));
15703 // Shift to the LSB.
15704 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15705 DAG.getConstant(IdxVal, dl, MVT::i8));
15707 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15708 DAG.getIntPtrConstant(0, dl));
15709 }
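// E.g. extracting <2 x i1> at index 2 from a v8i1 mask (with DQI) becomes,
// roughly,
//   kshiftrb $2, %k0, %k0
// followed by the now-legal EXTRACT_SUBVECTOR at index 0. Sketch only; the
// kshift width follows WideVecVT.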
15711 // Returns the appropriate wrapper opcode for a global reference.
15712 unsigned X86TargetLowering::getGlobalWrapperKind(
15713 const GlobalValue *GV, const unsigned char OpFlags) const {
15714 // References to absolute symbols are never PC-relative.
15715 if (GV && GV->isAbsoluteSymbolRef())
15716 return X86ISD::Wrapper;
15718 CodeModel::Model M = getTargetMachine().getCodeModel();
15719 if (Subtarget.isPICStyleRIPRel() &&
15720 (M == CodeModel::Small || M == CodeModel::Kernel))
15721 return X86ISD::WrapperRIP;
15723 // GOTPCREL references must always use RIP.
15724 if (OpFlags == X86II::MO_GOTPCREL)
15725 return X86ISD::WrapperRIP;
15727 return X86ISD::Wrapper;
15728 }
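// In effect (editor's sketch): a small-code-model RIP-relative reference
// selects through WrapperRIP to something like "leaq sym(%rip), %rax", while
// plain Wrapper yields an absolute "movl $sym, %eax" on 32-bit targets.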
15730 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15731 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15732 // one of the above mentioned nodes. It has to be wrapped because otherwise
15733 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15734 // be used to form addressing modes. These wrapped nodes will be selected
15735 // into MOV32ri.
15736 SDValue
15737 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15738 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15740 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15741 // global base reg.
15742 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15744 auto PtrVT = getPointerTy(DAG.getDataLayout());
15745 SDValue Result = DAG.getTargetConstantPool(
15746 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15747 SDLoc DL(CP);
15748 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15749 // With PIC, the address is actually $g + Offset.
15750 if (OpFlag) {
15751 Result =
15752 DAG.getNode(ISD::ADD, DL, PtrVT,
15753 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15754 }
15756 return Result;
15757 }
15759 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15760 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15762 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15763 // global base reg.
15764 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15766 auto PtrVT = getPointerTy(DAG.getDataLayout());
15767 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15768 SDLoc DL(JT);
15769 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15771 // With PIC, the address is actually $g + Offset.
15772 if (OpFlag) {
15773 Result =
15774 DAG.getNode(ISD::ADD, DL, PtrVT,
15775 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15776 }
15777 return Result;
15778 }
15780 SDValue
15781 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15782 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15784 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15785 // global base reg.
15786 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15787 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15789 auto PtrVT = getPointerTy(DAG.getDataLayout());
15790 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15792 SDLoc DL(Op);
15793 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15795 // With PIC, the address is actually $g + Offset.
15796 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15797 Result =
15798 DAG.getNode(ISD::ADD, DL, PtrVT,
15799 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15800 }
15802 // For symbols that require a load from a stub to get the address, emit the
15803 // load.
15804 if (isGlobalStubReference(OpFlag))
15805 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15806 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15808 return Result;
15809 }
15811 SDValue
15812 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15813 // Create the TargetBlockAddressAddress node.
15814 unsigned char OpFlags =
15815 Subtarget.classifyBlockAddressReference();
15816 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15817 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15818 SDLoc dl(Op);
15819 auto PtrVT = getPointerTy(DAG.getDataLayout());
15820 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15821 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15823 // With PIC, the address is actually $g + Offset.
15824 if (isGlobalRelativeToPICBase(OpFlags)) {
15825 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15826 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15827 }
15829 return Result;
15830 }
15832 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15833 const SDLoc &dl, int64_t Offset,
15834 SelectionDAG &DAG) const {
15835 // Create the TargetGlobalAddress node, folding in the constant
15836 // offset if it is legal.
15837 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15838 CodeModel::Model M = DAG.getTarget().getCodeModel();
15839 auto PtrVT = getPointerTy(DAG.getDataLayout());
15840 SDValue Result;
15841 if (OpFlags == X86II::MO_NO_FLAG &&
15842 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15843 // A direct static reference to a global.
15844 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15845 Offset = 0;
15846 } else {
15847 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15848 }
15850 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
15852 // With PIC, the address is actually $g + Offset.
15853 if (isGlobalRelativeToPICBase(OpFlags)) {
15854 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15855 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15858 // For globals that require a load from a stub to get the address, emit the
15859 // load.
15860 if (isGlobalStubReference(OpFlags))
15861 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15862 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15864 // If there was a non-zero offset that we didn't fold, create an explicit
15865 // addition for it.
15866 if (Offset != 0)
15867 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15868 DAG.getConstant(Offset, dl, PtrVT));
15870 return Result;
15871 }
15874 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15875 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15876 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15877 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15878 }
15880 static SDValue
15881 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15882 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15883 unsigned char OperandFlags, bool LocalDynamic = false) {
15884 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15885 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15886 SDLoc dl(GA);
15887 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15888 GA->getValueType(0),
15889 GA->getOffset(), OperandFlags);
15892 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15893 : X86ISD::TLSADDR;
15895 if (InFlag) {
15896 SDValue Ops[] = { Chain, TGA, *InFlag };
15897 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15898 } else {
15899 SDValue Ops[] = { Chain, TGA };
15900 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15901 }
15903 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15904 MFI.setAdjustsStack(true);
15905 MFI.setHasCalls(true);
15907 SDValue Flag = Chain.getValue(1);
15908 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15909 }
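// For reference, the general-dynamic TLSADDR node on x86-64 ELF ends up as
// the canonical sequence (roughly)
//   leaq  x@tlsgd(%rip), %rdi
//   callq __tls_get_addr@PLT
// with the address returned in RAX, which is what the CopyFromReg above reads.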
15911 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
15912 static SDValue
15913 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15914 const EVT PtrVT) {
15915 SDValue InFlag;
15916 SDLoc dl(GA); // ? function entry point might be better
15917 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15918 DAG.getNode(X86ISD::GlobalBaseReg,
15919 SDLoc(), PtrVT), InFlag);
15920 InFlag = Chain.getValue(1);
15922 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15925 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
15926 static SDValue
15927 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15928 const EVT PtrVT) {
15929 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15930 X86::RAX, X86II::MO_TLSGD);
15933 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15934 SelectionDAG &DAG, const EVT PtrVT,
15935 bool is64Bit) {
15936 SDLoc dl(GA);
15939 // Get the start address of the TLS block for this module.
15940 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15941 .getInfo<X86MachineFunctionInfo>();
15942 MFI->incNumLocalDynamicTLSAccesses();
15944 SDValue Base;
15945 if (is64Bit) {
15946 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15947 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15948 } else {
15949 SDValue InFlag;
15950 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15951 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15952 InFlag = Chain.getValue(1);
15953 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15954 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15955 }
15957 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15958 // of Base.
15960 // Build x@dtpoff.
15961 unsigned char OperandFlags = X86II::MO_DTPOFF;
15962 unsigned WrapperKind = X86ISD::Wrapper;
15963 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15964 GA->getValueType(0),
15965 GA->getOffset(), OperandFlags);
15966 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15968 // Add x@dtpoff with the base.
15969 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15970 }
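// Sketch of the resulting local-dynamic code on x86-64: TLSBASEADDR computes
// the module's TLS base into RAX once, and each variable is then formed as
//   leaq x@dtpoff(%rax), %rcx
// i.e. the ADD of Base and the wrapped x@dtpoff node built above
// (illustrative registers).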
15972 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15973 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15974 const EVT PtrVT, TLSModel::Model model,
15975 bool is64Bit, bool isPIC) {
15976 SDLoc dl(GA);
15978 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15979 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15980 is64Bit ? 257 : 256));
15982 SDValue ThreadPointer =
15983 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15984 MachinePointerInfo(Ptr));
15986 unsigned char OperandFlags = 0;
15987 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15988 // initial exec.
15989 unsigned WrapperKind = X86ISD::Wrapper;
15990 if (model == TLSModel::LocalExec) {
15991 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15992 } else if (model == TLSModel::InitialExec) {
15993 if (is64Bit) {
15994 OperandFlags = X86II::MO_GOTTPOFF;
15995 WrapperKind = X86ISD::WrapperRIP;
15996 } else {
15997 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15998 }
15999 } else {
16000 llvm_unreachable("Unexpected model");
16001 }
16003 // emit "addl x@ntpoff,%eax" (local exec)
16004 // or "addl x@indntpoff,%eax" (initial exec)
16005 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
16006 SDValue TGA =
16007 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
16008 GA->getOffset(), OperandFlags);
16009 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
16011 if (model == TLSModel::InitialExec) {
16012 if (isPIC && !is64Bit) {
16013 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
16014 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16015 Offset);
16016 }
16018 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
16019 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
16022 // The address of the thread local variable is the add of the thread
16023 // pointer with the offset of the variable.
16024 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
16025 }
16027 SDValue
16028 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
16030 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
16032 if (DAG.getTarget().useEmulatedTLS())
16033 return LowerToTLSEmulatedModel(GA, DAG);
16035 const GlobalValue *GV = GA->getGlobal();
16036 auto PtrVT = getPointerTy(DAG.getDataLayout());
16037 bool PositionIndependent = isPositionIndependent();
16039 if (Subtarget.isTargetELF()) {
16040 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
16041 switch (model) {
16042 case TLSModel::GeneralDynamic:
16043 if (Subtarget.is64Bit())
16044 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
16045 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
16046 case TLSModel::LocalDynamic:
16047 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
16048 Subtarget.is64Bit());
16049 case TLSModel::InitialExec:
16050 case TLSModel::LocalExec:
16051 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
16052 PositionIndependent);
16053 }
16054 llvm_unreachable("Unknown TLS model.");
16055 }
16057 if (Subtarget.isTargetDarwin()) {
16058 // Darwin only has one model of TLS. Lower to that.
16059 unsigned char OpFlag = 0;
16060 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
16061 X86ISD::WrapperRIP : X86ISD::Wrapper;
16063 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16064 // global base reg.
16065 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
16066 if (PIC32)
16067 OpFlag = X86II::MO_TLVP_PIC_BASE;
16068 else
16069 OpFlag = X86II::MO_TLVP;
16070 SDLoc DL(Op);
16071 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
16072 GA->getValueType(0),
16073 GA->getOffset(), OpFlag);
16074 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
16076 // With PIC32, the address is actually $g + Offset.
16077 if (PIC32)
16078 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
16079 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16080 Offset);
16082 // Lowering the machine isd will make sure everything is in the right
16083 // location.
16084 SDValue Chain = DAG.getEntryNode();
16085 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16086 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16087 SDValue Args[] = { Chain, Offset };
16088 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
16089 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
16090 DAG.getIntPtrConstant(0, DL, true),
16091 Chain.getValue(1), DL);
16093 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
16094 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16095 MFI.setAdjustsStack(true);
16097 // And our return value (tls address) is in the standard call return value
16098 // location.
16099 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
16100 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
16101 }
16103 if (Subtarget.isTargetKnownWindowsMSVC() ||
16104 Subtarget.isTargetWindowsItanium() ||
16105 Subtarget.isTargetWindowsGNU()) {
16106 // Just use the implicit TLS architecture
16107 // Need to generate something similar to:
16108 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
16109 // ; from TEB
16110 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
16111 // mov rcx, qword [rdx+rcx*8]
16112 // mov eax, .tls$:tlsvar
16113 // [rax+rcx] contains the address
16114 // Windows 64bit: gs:0x58
16115 // Windows 32bit: fs:__tls_array
16117 SDLoc dl(GA);
16118 SDValue Chain = DAG.getEntryNode();
16120 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
16121 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
16122 // use its literal value of 0x2C.
16123 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
16124 ? Type::getInt8PtrTy(*DAG.getContext(), 256)
16126 : Type::getInt32PtrTy(*DAG.getContext(), 257));
16129 SDValue TlsArray = Subtarget.is64Bit()
16130 ? DAG.getIntPtrConstant(0x58, dl)
16131 : (Subtarget.isTargetWindowsGNU()
16132 ? DAG.getIntPtrConstant(0x2C, dl)
16133 : DAG.getExternalSymbol("_tls_array", PtrVT));
16135 SDValue ThreadPointer =
16136 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
16138 SDValue res;
16139 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
16140 res = ThreadPointer;
16141 } else {
16142 // Load the _tls_index variable
16143 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
16144 if (Subtarget.is64Bit())
16145 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
16146 MachinePointerInfo(), MVT::i32);
16147 else
16148 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
16150 auto &DL = DAG.getDataLayout();
16151 SDValue Scale =
16152 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
16153 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
16155 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
16156 }
16158 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
16160 // Get the offset of start of .tls section
16161 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16162 GA->getValueType(0),
16163 GA->getOffset(), X86II::MO_SECREL);
16164 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
16166 // The address of the thread local variable is the add of the thread
16167 // pointer with the offset of the variable.
16168 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
16169 }
16171 llvm_unreachable("TLS not implemented for this target.");
16172 }
16174 /// Lower SRA_PARTS and friends, which return two i32 values
16175 /// and take a 2 x i32 value to shift plus a shift amount.
16176 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
16177 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
16178 MVT VT = Op.getSimpleValueType();
16179 unsigned VTBits = VT.getSizeInBits();
16180 SDLoc dl(Op);
16181 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
16182 SDValue ShOpLo = Op.getOperand(0);
16183 SDValue ShOpHi = Op.getOperand(1);
16184 SDValue ShAmt = Op.getOperand(2);
16185 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
16186 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
16187 // before the final shift.
16188 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16189 DAG.getConstant(VTBits - 1, dl, MVT::i8));
16190 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
16191 DAG.getConstant(VTBits - 1, dl, MVT::i8))
16192 : DAG.getConstant(0, dl, VT);
16194 SDValue Tmp2, Tmp3;
16195 if (Op.getOpcode() == ISD::SHL_PARTS) {
16196 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
16197 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
16199 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
16200 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
16203 // If the shift amount is larger or equal than the width of a part we can't
16204 // rely on the results of shld/shrd. Insert a test and select the appropriate
16205 // values for large shift amounts.
16206 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16207 DAG.getConstant(VTBits, dl, MVT::i8));
16208 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
16209 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
16211 SDValue Hi, Lo;
16212 if (Op.getOpcode() == ISD::SHL_PARTS) {
16213 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16214 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16215 } else {
16216 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16217 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16218 }
16220 return DAG.getMergeValues({ Lo, Hi }, dl);
16221 }
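// Worked example for i64 SHL_PARTS on a 32-bit target: for a shift amount c,
//   Tmp2 = shld(hi, lo, c)   and   Tmp3 = shl(lo, c & 31).
// When bit 5 of c is set (c >= 32), the selects above instead produce
//   Hi = Tmp3 (lo << (c & 31)) and Lo = 0,
// which is exactly the test on (c & VTBits) built with AndNode.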
16223 // Try to use a packed vector operation to handle i64 on 32-bit targets when
16224 // AVX512DQ is enabled.
16225 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
16226 const X86Subtarget &Subtarget) {
16227 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
16228 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
16229 SDValue Src = Op.getOperand(0);
16230 MVT SrcVT = Src.getSimpleValueType();
16231 MVT VT = Op.getSimpleValueType();
16233 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
16234 (VT != MVT::f32 && VT != MVT::f64))
16235 return SDValue();
16237 // Pack the i64 into a vector, do the operation and extract.
16239 // Using a 256-bit vector to ensure the result is 128 bits for the f32 case.
16240 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
16241 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
16242 MVT VecVT = MVT::getVectorVT(VT, NumElts);
16244 SDLoc dl(Op);
16245 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
16246 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
16247 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
16248 DAG.getIntPtrConstant(0, dl));
16251 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
16252 SelectionDAG &DAG) const {
16253 SDValue Src = Op.getOperand(0);
16254 MVT SrcVT = Src.getSimpleValueType();
16255 MVT VT = Op.getSimpleValueType();
16256 SDLoc dl(Op);
16258 if (SrcVT.isVector()) {
16259 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
16260 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16261 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16262 DAG.getUNDEF(SrcVT)));
16263 }
16264 return SDValue();
16265 }
16267 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16268 "Unknown SINT_TO_FP to lower!");
16270 // These are really Legal; return the operand so the caller accepts it as
16271 // Legal.
16272 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
16273 return Op;
16274 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
16275 return Op;
16276 }
16278 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16279 return V;
16281 SDValue ValueToStore = Op.getOperand(0);
16282 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
16283 !Subtarget.is64Bit())
16284 // Bitcasting to f64 here allows us to do a single 64-bit store from
16285 // an SSE register, avoiding the store forwarding penalty that would come
16286 // with two 32-bit stores.
16287 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16289 unsigned Size = SrcVT.getSizeInBits()/8;
16290 MachineFunction &MF = DAG.getMachineFunction();
16291 auto PtrVT = getPointerTy(MF.getDataLayout());
16292 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16293 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16294 SDValue Chain = DAG.getStore(
16295 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16296 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16297 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
16300 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16301 SDValue StackSlot,
16302 SelectionDAG &DAG) const {
16303 // Build the FILD
16304 SDLoc DL(Op);
16305 SDVTList Tys;
16306 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
16307 if (useSSE)
16308 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16309 else
16310 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16312 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16314 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16315 MachineMemOperand *MMO;
16316 if (FI) {
16317 int SSFI = FI->getIndex();
16318 MMO = DAG.getMachineFunction().getMachineMemOperand(
16319 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16320 MachineMemOperand::MOLoad, ByteSize, ByteSize);
16321 } else {
16322 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16323 StackSlot = StackSlot.getOperand(1);
16324 }
16325 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16326 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16327 X86ISD::FILD, DL,
16328 Tys, Ops, SrcVT, MMO);
16330 if (useSSE) {
16331 Chain = Result.getValue(1);
16332 SDValue InFlag = Result.getValue(2);
16334 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16335 // shouldn't be necessary except that RFP cannot be live across
16336 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16337 MachineFunction &MF = DAG.getMachineFunction();
16338 unsigned SSFISize = Op.getValueSizeInBits()/8;
16339 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16340 auto PtrVT = getPointerTy(MF.getDataLayout());
16341 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16342 Tys = DAG.getVTList(MVT::Other);
16343 SDValue Ops[] = {
16344 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16345 };
16346 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16347 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16348 MachineMemOperand::MOStore, SSFISize, SSFISize);
16350 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16351 Ops, Op.getValueType(), MMO);
16352 Result = DAG.getLoad(
16353 Op.getValueType(), DL, Chain, StackSlot,
16354 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16355 }
16357 return Result;
16358 }
16360 /// 64-bit unsigned integer to double expansion.
16361 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16362 const X86Subtarget &Subtarget) {
16363 // This algorithm is not obvious. Here is what we're trying to output:
16364 /*
16365 movq %rax, %xmm0
16366 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16367 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16368 #ifdef __SSE3__
16369 haddpd %xmm0, %xmm0
16370 #else
16371 pshufd $0x4e, %xmm0, %xmm1
16372 addpd %xmm1, %xmm0
16373 #endif
16374 */
16377 LLVMContext *Context = DAG.getContext();
16378 SDLoc dl(Op);
16379 // Build some magic constants.
16380 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16381 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16382 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16383 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16385 SmallVector<Constant*,2> CV1;
16386 CV1.push_back(
16387 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16388 APInt(64, 0x4330000000000000ULL))));
16389 CV1.push_back(
16390 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16391 APInt(64, 0x4530000000000000ULL))));
16392 Constant *C1 = ConstantVector::get(CV1);
16393 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16395 // Load the 64-bit value into an XMM register.
16396 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16397 Op.getOperand(0));
16398 SDValue CLod0 =
16399 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16400 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16401 /* Alignment = */ 16);
16402 SDValue Unpck1 =
16403 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16405 SDValue CLod1 =
16406 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16407 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16408 /* Alignment = */ 16);
16409 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16410 // TODO: Are there any fast-math-flags to propagate here?
16411 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
16413 SDValue Result;
16414 if (Subtarget.hasSSE3()) {
16415 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16416 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16417 } else {
16418 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16419 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16420 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16421 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16422 }
16424 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16425 DAG.getIntPtrConstant(0, dl));
16426 }
16428 /// 32-bit unsigned integer to float expansion.
16429 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16430 const X86Subtarget &Subtarget) {
16431 SDLoc dl(Op);
16432 // FP constant to bias-correct the final result.
16433 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16434 MVT::f64);
16436 // Load the 32-bit value into an XMM register.
16437 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16438 Op.getOperand(0));
16440 // Zero out the upper parts of the register.
16441 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16443 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16444 DAG.getBitcast(MVT::v2f64, Load),
16445 DAG.getIntPtrConstant(0, dl));
16447 // Or the load with the bias.
16448 SDValue Or = DAG.getNode(
16449 ISD::OR, dl, MVT::v2i64,
16450 DAG.getBitcast(MVT::v2i64,
16451 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16452 DAG.getBitcast(MVT::v2i64,
16453 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16454 Or =
16455 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16456 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16458 // Subtract the bias.
16459 // TODO: Are there any fast-math-flags to propagate here?
16460 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
16462 // Handle final rounding.
16463 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
16464 }
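// The arithmetic behind the bias trick, for reference: OR-ing the
// zero-extended 32-bit value x into the mantissa of 0x1.0p52 yields the
// double 2^52 + x exactly (x < 2^32 fits in the 52-bit mantissa), so
// (2^52 + x) - 2^52 == (double)x with no rounding at all.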
16466 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16467 const X86Subtarget &Subtarget,
16468 const SDLoc &DL) {
16469 if (Op.getSimpleValueType() != MVT::v2f64)
16470 return SDValue();
16472 SDValue N0 = Op.getOperand(0);
16473 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16475 // Legalize to v4i32 type.
16476 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16477 DAG.getUNDEF(MVT::v2i32));
16479 if (Subtarget.hasAVX512())
16480 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16482 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16483 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
16484 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16485 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16487 // Two to the power of half-word-size.
16488 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16490 // Clear upper part of LO, lower HI.
16491 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16492 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
16494 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16495 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16496 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16498 // Add the two halves.
16499 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
16502 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16503 const X86Subtarget &Subtarget) {
16504 // The algorithm is the following:
16505 // #ifdef __SSE4_1__
16506 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16507 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16508 // (uint4) 0x53000000, 0xaa);
16509 // #else
16510 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16511 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16512 // #endif
16513 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16514 // return (float4) lo + fhi;
16516 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16517 // reassociate the two FADDs, and if we do that, the algorithm fails
16518 // spectacularly (PR24512).
16519 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16520 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16521 // there's also the MachineCombiner reassociations happening on Machine IR.
16522 if (DAG.getTarget().Options.UnsafeFPMath)
16523 return SDValue();
16525 SDLoc DL(Op);
16526 SDValue V = Op->getOperand(0);
16527 MVT VecIntVT = V.getSimpleValueType();
16528 bool Is128 = VecIntVT == MVT::v4i32;
16529 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16530 // If we convert to something other than the supported type, e.g., to v4f64,
16531 // abort early.
16532 if (VecFloatVT != Op->getSimpleValueType(0))
16533 return SDValue();
16535 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16536 "Unsupported custom type");
16538 // In the #ifdef/#else code, we have in common:
16539 // - The vector of constants:
16540 // -- 0x4b000000
16541 // -- 0x53000000
16542 // - A shift:
16543 // -- v >> 16
16545 // Create the splat vector for 0x4b000000.
16546 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16547 // Create the splat vector for 0x53000000.
16548 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16550 // Create the right shift.
16551 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16552 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16554 SDValue Low, High;
16555 if (Subtarget.hasSSE41()) {
16556 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16557 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16558 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16559 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16560 // Low will be bitcasted right away, so do not bother bitcasting back to its
16561 // original type.
16562 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16563 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16564 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16565 // (uint4) 0x53000000, 0xaa);
16566 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16567 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16568 // High will be bitcasted right away, so do not bother bitcasting back to
16569 // its original type.
16570 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16571 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16572 } else {
16573 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16574 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16575 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16576 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16578 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16579 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16580 }
16582 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
16583 SDValue VecCstFAdd = DAG.getConstantFP(
16584 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16586 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16587 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16588 // TODO: Are there any fast-math-flags to propagate here?
16589 SDValue FHigh =
16590 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16591 // return (float4) lo + fhi;
16592 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16593 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16594 }
16596 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16597 const X86Subtarget &Subtarget) {
16598 SDValue N0 = Op.getOperand(0);
16599 MVT SrcVT = N0.getSimpleValueType();
16600 SDLoc dl(Op);
16602 switch (SrcVT.SimpleTy) {
16603 default:
16604 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16605 case MVT::v2i32:
16606 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16607 case MVT::v4i32:
16608 case MVT::v8i32:
16609 assert(!Subtarget.hasAVX512());
16610 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16611 }
16612 }
16614 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16615 SelectionDAG &DAG) const {
16616 SDValue N0 = Op.getOperand(0);
16617 SDLoc dl(Op);
16618 auto PtrVT = getPointerTy(DAG.getDataLayout());
16620 if (Op.getSimpleValueType().isVector())
16621 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16623 MVT SrcVT = N0.getSimpleValueType();
16624 MVT DstVT = Op.getSimpleValueType();
16626 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16627 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16628 // Conversions from unsigned i32 to f32/f64 are legal,
16629 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16630 return Op;
16631 }
16633 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16634 return V;
16636 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16637 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16638 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16639 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16640 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16641 return SDValue();
16643 // Make a 64-bit buffer, and use it to build an FILD.
16644 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16645 if (SrcVT == MVT::i32) {
16646 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16647 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16648 StackSlot, MachinePointerInfo());
16649 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16650 OffsetSlot, MachinePointerInfo());
16651 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16652 return Fild;
16653 }
16655 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16656 SDValue ValueToStore = Op.getOperand(0);
16657 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16658 // Bitcasting to f64 here allows us to do a single 64-bit store from
16659 // an SSE register, avoiding the store forwarding penalty that would come
16660 // with two 32-bit stores.
16661 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16662 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16663 MachinePointerInfo());
16664 // For i64 source, we need to add the appropriate power of 2 if the input
16665 // was negative. This is the same as the optimization in
16666 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16667 // we must be careful to do the computation in x87 extended precision, not
16668 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16669 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16670 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16671 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16672 MachineMemOperand::MOLoad, 8, 8);
16674 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16675 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16676 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16677 MVT::i64, MMO);
16679 APInt FF(32, 0x5F800000ULL);
16681 // Check whether the sign bit is set.
16682 SDValue SignSet = DAG.getSetCC(
16683 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16684 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16686 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16687 SDValue FudgePtr = DAG.getConstantPool(
16688 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16690 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16691 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16692 SDValue Four = DAG.getIntPtrConstant(4, dl);
16693 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16694 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16696 // Load the value out, extending it from f32 to f80.
16697 // FIXME: Avoid the extend by constructing the right constant pool?
16698 SDValue Fudge = DAG.getExtLoad(
16699 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16700 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16701 /* Alignment = */ 4);
16702 // Extend everything to 80 bits to force it to be done on x87.
16703 // TODO: Are there any fast-math-flags to propagate here?
16704 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16705 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16706 DAG.getIntPtrConstant(0, dl));
16707 }
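// Rationale, sketched: FILD reads the stored i64 as signed, so an input with
// the sign bit set is interpreted as x - 2^64. The constant 0x5F800000 is
// 2^64 as an f32, so adding the selected fudge value computes
//   (x - 2^64) + 2^64 == x
// in f80, where the addition is exact enough before the final FP_ROUND.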
16709 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16710 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16711 // just return an <SDValue(), SDValue()> pair.
16712 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16713 // to i16, i32 or i64, and we lower it to a legal sequence.
16714 // If lowered to the final integer result we return a <result, SDValue()> pair.
16715 // Otherwise we lower it to a sequence ending with a FIST, return a
16716 // <FIST, StackSlot> pair, and the caller is responsible for loading
16717 // the final integer result from StackSlot.
16718 std::pair<SDValue,SDValue>
16719 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16720 bool IsSigned, bool IsReplace) const {
16722 SDLoc DL(Op);
16723 EVT DstTy = Op.getValueType();
16724 EVT TheVT = Op.getOperand(0).getValueType();
16725 auto PtrVT = getPointerTy(DAG.getDataLayout());
16727 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16728 // f16 must be promoted before using the lowering in this routine.
16729 // fp128 does not use this lowering.
16730 return std::make_pair(SDValue(), SDValue());
16733 // If using FIST to compute an unsigned i64, we'll need some fixup
16734 // to handle values above the maximum signed i64. A FIST is always
16735 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16736 bool UnsignedFixup = !IsSigned &&
16737 DstTy == MVT::i64 &&
16738 (!Subtarget.is64Bit() ||
16739 !isScalarFPTypeInSSEReg(TheVT));
16741 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16742 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16743 // The low 32 bits of the fist result will have the correct uint32 result.
16744 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16745 DstTy = MVT::i64;
16746 }
16748 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16749 DstTy.getSimpleVT() >= MVT::i16 &&
16750 "Unknown FP_TO_INT to lower!");
16752 // These are really Legal.
16753 if (DstTy == MVT::i32 &&
16754 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16755 return std::make_pair(SDValue(), SDValue());
16756 if (Subtarget.is64Bit() &&
16757 DstTy == MVT::i64 &&
16758 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16759 return std::make_pair(SDValue(), SDValue());
16761 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16762 // stack slot.
16763 MachineFunction &MF = DAG.getMachineFunction();
16764 unsigned MemSize = DstTy.getSizeInBits()/8;
16765 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16766 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16768 unsigned Opc;
16769 switch (DstTy.getSimpleVT().SimpleTy) {
16770 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16771 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16772 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16773 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16774 }
16776 SDValue Chain = DAG.getEntryNode();
16777 SDValue Value = Op.getOperand(0);
16778 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16780 if (UnsignedFixup) {
16782 // Conversion to unsigned i64 is implemented with a select,
16783 // depending on whether the source value fits in the range
16784 // of a signed i64. Let Thresh be the FP equivalent of
16785 // 0x8000000000000000ULL.
16787 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16788 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16789 // Fist-to-mem64 FistSrc
16790 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16791 // to XOR'ing the high 32 bits with Adjust.
16793 // Being a power of 2, Thresh is exactly representable in all FP formats.
16794 // For X87 we'd like to use the smallest FP type for this constant, but
16795 // for DAG type consistency we have to match the FP operand type.
16797 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16798 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16799 bool LosesInfo = false;
16800 if (TheVT == MVT::f64)
16801 // The rounding mode is irrelevant as the conversion should be exact.
16802 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16803 &LosesInfo);
16804 else if (TheVT == MVT::f80)
16805 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16806 APFloat::rmNearestTiesToEven, &LosesInfo);
16808 assert(Status == APFloat::opOK && !LosesInfo &&
16809 "FP conversion should have been exact");
16811 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16813 SDValue Cmp = DAG.getSetCC(DL,
16814 getSetCCResultType(DAG.getDataLayout(),
16815 *DAG.getContext(), TheVT),
16816 Value, ThreshVal, ISD::SETLT);
16817 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16818 DAG.getConstant(0, DL, MVT::i32),
16819 DAG.getConstant(0x80000000, DL, MVT::i32));
16820 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16821 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16822 *DAG.getContext(), TheVT),
16823 Value, ThreshVal, ISD::SETLT);
16824 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16825 }
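// Numeric sketch of the fixup: for Value == 2^63 + 42, the compare fails, so
// the FIST converts Sub == Value - 2^63 == 42.0 and Adjust == 0x80000000;
// XOR-ing the high half of the 64-bit result with Adjust then restores
// 2^63 + 42.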
16827 // FIXME This causes a redundant load/store if the SSE-class value is already
16828 // in memory, such as if it is on the call stack.
16829 if (isScalarFPTypeInSSEReg(TheVT)) {
16830 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16831 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16832 MachinePointerInfo::getFixedStack(MF, SSFI));
16833 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16834 SDValue Ops[] = {
16835 Chain, StackSlot, DAG.getValueType(TheVT)
16836 };
16838 MachineMemOperand *MMO =
16839 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16840 MachineMemOperand::MOLoad, MemSize, MemSize);
16841 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16842 Chain = Value.getValue(1);
16843 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16844 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16847 MachineMemOperand *MMO =
16848 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16849 MachineMemOperand::MOStore, MemSize, MemSize);
16851 if (UnsignedFixup) {
16853 // Insert the FIST, load its result as two i32's,
16854 // and XOR the high i32 with Adjust.
16856 SDValue FistOps[] = { Chain, Value, StackSlot };
16857 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16858 FistOps, DstTy, MMO);
16860 SDValue Low32 =
16861 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16862 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16864 SDValue High32 =
16865 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16866 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16868 if (Subtarget.is64Bit()) {
16869 // Join High32 and Low32 into a 64-bit result.
16870 // (High32 << 32) | Low32
16871 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16872 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16873 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16874 DAG.getConstant(32, DL, MVT::i8));
16875 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16876 return std::make_pair(Result, SDValue());
16879 SDValue ResultOps[] = { Low32, High32 };
16881 SDValue pair = IsReplace
16882 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16883 : DAG.getMergeValues(ResultOps, DL);
16884 return std::make_pair(pair, SDValue());
16886 // Build the FP_TO_INT*_IN_MEM
16887 SDValue Ops[] = { Chain, Value, StackSlot };
16888 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16890 return std::make_pair(FIST, StackSlot);
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements() / 2);

  OpLo = DAG.getBitcast(HVT, OpLo);
  OpHi = DAG.getBitcast(HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
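
// For example (illustrative), with AVX1 only, (zext v8i16 %x to v8i32)
// becomes roughly:
//   %lo = vpunpcklwd %x, <zero>   ; lower 4 elements widened to v4i32
//   %hi = vpunpckhwd %x, <zero>   ; upper 4 elements widened to v4i32
//   %r  = concat_vectors %lo, %hi ; v8i32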
// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
                           DAG.getIntPtrConstant(0, dl));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
                           DAG.getIntPtrConstant(8, dl));
  Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
  Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
  // avoids a constant pool load.
  if (VT.getVectorElementType() != MVT::i8) {
    SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
    return DAG.getNode(ISD::SRL, DL, VT, Extend,
                       DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
  }

  // Extend VT if BWI is not supported.
  MVT ExtVT = VT;
  if (!Subtarget.hasBWI()) {
    // If v16i32 is to be avoided, we'll need to split and concatenate.
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
      return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);

    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

  // Widen to 512-bits if VLX is not supported.
  MVT WideVT = ExtVT;
  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
    NumElts *= 512 / ExtVT.getSizeInBits();
    InVT = MVT::getVectorVT(MVT::i1, NumElts);
    In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
                     In, DAG.getIntPtrConstant(0, DL));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  SDValue One = DAG.getConstant(1, DL, WideVT);
  SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);

  SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);

  // Truncate if we had to extend above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(MVT::i8, NumElts);
    SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
                              DAG.getIntPtrConstant(0, DL));

  return SelectedVal;
}
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

  assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
/// It makes use of the fact that vectors with enough leading sign/zero bits
/// prevent the PACKSS/PACKUS from saturating the results.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
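///
/// Illustrative example (assuming enough sign bits): a v8i32 -> v8i16
/// truncate on AVX2 becomes roughly:
///   t = PACKSSDW lo128, hi128    ; packs within each 128-bit lane
///   r = shuffle t, t, <0,2,1,3>  ; as v4i64, to undo the lane interleaving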
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
                                      const SDLoc &DL, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
         "Unexpected PACK opcode");

  // Requires SSE2 but AVX512 has fast vector truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 64bits or greater from a
  // 128bits or greater source.
  unsigned DstSizeInBits = DstVT.getSizeInBits();
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
    return SDValue();

  unsigned NumElems = SrcVT.getVectorNumElements();
  if (!isPowerOf2_32(NumElems))
    return SDValue();

  LLVMContext &Ctx = *DAG.getContext();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

  // Pack to the largest type possible:
  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
  EVT InVT = MVT::i16, OutVT = MVT::i8;
  if (SrcVT.getScalarSizeInBits() > 16 &&
      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
    InVT = MVT::i32;
    OutVT = MVT::i16;
  }

  // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
  if (SrcVT.is128BitVector()) {
    InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
    OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
    In = DAG.getBitcast(InVT, In);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
    Res = extractSubVector(Res, 0, DAG, DL, 64);
    return DAG.getBitcast(DstVT, Res);
  }

  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

  unsigned SubSizeInBits = SrcSizeInBits / 2;
  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(InVT, Lo);
    Hi = DAG.getBitcast(InVT, Hi);
    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
        // We need to shift to get the lsb into sign position.
        // Shifting packed bytes isn't supported natively, so bitcast to words.
        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits() / 16);
        In = DAG.getNode(ISD::SHL, DL, ExtVT,
                         DAG.getBitcast(ExtVT, In),
                         DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      }
      return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
                          In, ISD::SETGT);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
    // We need to change to a wider element type that we have support for.
    // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
    // For 16 element vectors we extend to v16i32 unless we are explicitly
    // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
    // we need to split into two 8 element vectors which we can extend to v8i32,
    // truncate and concat the results. There's an additional complication if
    // the original type is v16i8. In that case we can't split the v16i8 so
    // first we pre-extend it to v16i16 which we can split to v8i16, then extend
    // to v8i32, truncate that to v8i1 and concat the two halves.
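    //
    // Illustrative flow for v16i8 -> v16i1 without 512-bit vectors:
    //   v16i8 --sext--> v16i16 --split--> 2 x v8i16 --sext--> 2 x v8i32
    //   --trunc--> 2 x v8i1 --concat--> v16i1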
    if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
      if (InVT == MVT::v16i8) {
        // First we need to sign extend up to 256-bits so we can split that.
        InVT = MVT::v16i16;
        In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
      }
      SDValue Lo = extract128BitVector(In, 0, DAG, DL);
      SDValue Hi = extract128BitVector(In, 8, DAG, DL);
      // We're split now, just emit two truncates and a concat. The two
      // truncates will trigger legalization to come back to this function.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
      Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }
    // We either have 8 elements or we're allowed to use 512-bit vectors.
    // If we have VLX, we want to use the narrowest vector that can get the
    // job done so we use vXi32.
    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
    // We need to shift to get the lsb into sign position.
    In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  }
  // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
  if (Subtarget.hasDQI())
    return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
                        In, ISD::SETGT);
  return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
                      ISD::SETNE);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();
  unsigned InNumEltBits = InVT.getScalarSizeInBits();

  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // word to byte only under BWI
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
      // Make sure we're allowed to promote 512-bits.
      if (Subtarget.canExtendTo512DQ())
        return DAG.getNode(ISD::TRUNCATE, DL, VT,
                           DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
    } else {
      return Op;
    }
  }

  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Truncate with PACKUS if we are truncating a vector with leading zero bits
  // that extend all the way to the packed/truncated value.
  // Pre-SSE41 we can only use PACKUSWB.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;

  // Truncate with PACKSS if we are truncating a vector with sign-bits that
  // extend all the way to the packed/truncated value.
  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
      return V;
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));

    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));

    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }

  // Handle truncation of V256 to V128 using shuffles.
  assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");

  assert(Subtarget.hasAVX() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare truncation shuffle mask
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);

    if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
      MVT ResVT = MVT::v4i32;
      MVT TruncVT = MVT::v4i1;
      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
      if (!IsSigned && !Subtarget.hasVLX()) {
        // Widen to 512-bits.
        ResVT = MVT::v8i32;
        TruncVT = MVT::v8i1;
        Opc = ISD::FP_TO_UINT;
        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
                          DAG.getUNDEF(MVT::v8f64),
                          Src, DAG.getIntPtrConstant(0, dl));
      }
      SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
                         DAG.getIntPtrConstant(0, dl));
    }

    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }

    return SDValue();
  }

  assert(!VT.isVector());

  std::pair<SDValue, SDValue> Vals = FP_TO_INTHelper(Op, DAG,
                                                     IsSigned, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
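///
/// For example (illustrative), scalar f32 FNEG is lowered as:
///   %v = scalar_to_vector %x                         ; v4f32
///   %r = extractelt (fxor %v, <0x80000000 splat>), 0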
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = MVT::f128;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }

  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are no
  // scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
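
  // For example (illustrative), for f64 this creates
  // SignMask = 0x8000000000000000 and MagMask = 0x7FFFFFFFFFFFFFFF,
  // splatted across v2f64 when operating on a fake vector.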
  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
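
// Example use (illustrative): materialize "a == b" as an i8 boolean:
//   SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, A, B);
//   SDValue Res = getSETCC(X86::COND_E, Cmp, dl, DAG);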
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if it's not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
       I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
                            VecIns.back(), VecIns.back());
  return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
}
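
// For example (illustrative): comparing the OR of every extracted element
// of a bitcast v2i64 against zero collapses to a single (ptest %v, %v)
// whose ZF feeds the final setcc.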
/// Return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }

    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
      default: break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::AND:
      case ISD::OR:
      case ISD::XOR:
        NeedTruncation = true;
        ArithOp = Arith;
        break;
      }
  }

  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. The SRL/SHL variant should be preferred for masks longer than
  // this number of bits.
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // We only want to rewrite this as a target-specific node with attached
    // flags if there is a reasonable chance of either using that to do custom
    // instructions selection that can fold some of the memory operands, or if
    // only the flags are used. If there are other uses, leave the node alone
    // and emit a test instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() &&
          (!Subtarget.slowIncDec() ||
           DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() &&
          (!Subtarget.slowIncDec() ||
           DAG.getMachineFunction().getFunction().optForSize())) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;

  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
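    // For example (illustrative), "(x >> 7) == 0" for an i32 x becomes
    // "(x & 0xFFFFFF80) == 0", which selects to a TEST.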
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;

        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;

        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer TEST instruction.

        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();

        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }

        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();

        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Similar to ISD::ADD above, check if the uses will preclude useful
    // lowering of the target-specific node.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR:  Opcode = X86ISD::OR;  break;
    }
    break;

  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);

  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
        Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
  return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG);

  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
         "Unexpected comparison operation for MVT::i1 operands");

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Only promote the compare up to I32 if it is a 16 bit operation
    // with an immediate. 16 bit immediates are to be avoided.
    if ((Op0.getValueType() == MVT::i16 &&
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
        !DAG.getMachineFunction().getFunction().optForMinSize() &&
        !Subtarget.isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, dl, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
    return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
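/// One Newton-Raphson iteration for rsqrt is (illustrative):
///   est' = est * (1.5 - 0.5 * x * est * est)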
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
                                           SelectionDAG &DAG, int Enabled,
                                           int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
  // after legalize types.
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    UseOneConstNR = false;
    // There is no FSQRT for 512-bits, but there is RSQRT14.
    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
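/// One Newton-Raphson iteration for the reciprocal is (illustrative):
///   est' = est * (2.0 - x * est)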
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.

  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
      (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
    // Enable estimate codegen with 1 refinement step for vector division.
    // Scalar division estimates are disabled because they break too much
    // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();

    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    // There is no FDIV for 512-bits, but there is RCP14.
    unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
    return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
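/// For example (illustrative): "a/b; c/b" can become "r = 1.0/b; a*r; c*r"
/// once at least two divisions share the divisor b.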
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reason.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
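/// For example (illustrative): "(x & (1 << n)) != 0" becomes (BT x, n) with
/// the result read from the carry flag via SETB.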
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // sign bits.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    } else {
      // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
      bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
          isPowerOf2_64(AndRHSVal)) {
        LHS = AndLHS;
        RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
      }
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ: SSECC = 8; break;
  case ISD::SETONE: SSECC = 12; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();

  // If this is a seteq make sure any build vectors of all zeros are on the RHS.
  // This helps with vptestm matching.
  // TODO: Should we just canonicalize the setcc during DAG combine?
  if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
      ISD::isBuildVectorAllZeros(Op0.getNode()))
    std::swap(Op0, Op1);

  // Prefer SETGT over SETLT.
  if (SetCCOpcode == ISD::SETLT) {
    SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
    std::swap(Op0, Op1);
  }

  return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
}
/// Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
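/// For example (illustrative): "x u< <10,10,10,10>" becomes
/// "x u<= <9,9,9,9>" so that a PSUBUS-based lowering can be used.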
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}
/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
///   t = psubus Op0, Op1
///   pcmpeq t, <0..0>
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
                                    ISD::CondCode Cond, const SDLoc &dl,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  MVT VET = VT.getVectorElementType();
  if (VET != MVT::i8 && VET != MVT::i16)
    return SDValue();

  switch (Cond) {
  default:
    return SDValue();
  case ISD::SETULT: {
    // If the comparison is against a constant we can turn this into a
    // setule. With psubus, setule does not require a swap. This is
    // beneficial because the constant in the register is no longer
    // clobbered as the destination, so it can be hoisted out of a loop.
    // Only do this pre-AVX since vpcmp* is no longer destructive.
    if (Subtarget.hasAVX())
      return SDValue();
    SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
    if (!ULEOp1)
      return SDValue();
    Op1 = ULEOp1;
    break;
  }
  // Psubus is better than flip-sign because it requires no inversion.
  case ISD::SETUGE:
    std::swap(Op0, Op1);
    break;
  case ISD::SETULE:
    break;
  }

  SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                     getZeroVector(VT, Subtarget, DAG, dl));
}
18370 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18371 SelectionDAG &DAG) {
18372 SDValue Op0 = Op.getOperand(0);
18373 SDValue Op1 = Op.getOperand(1);
18374 SDValue CC = Op.getOperand(2);
18375 MVT VT = Op.getSimpleValueType();
18376 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18377 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
18382 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18383 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
18387 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18388 assert(VT.getVectorNumElements() <= 16);
18389 Opc = X86ISD::CMPM;
18391 Opc = X86ISD::CMPP;
18392 // The SSE/AVX packed FP comparison nodes are defined with a
18393 // floating-point vector result that matches the operand type. This allows
18394 // them to work with an SSE1 target (integer vector types are not legal).
18395 VT = Op0.getSimpleValueType();
18398 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18399 // emit two comparisons and a logic op to tie them together.
18401 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18402 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18403 // LLVM predicate is SETUEQ or SETONE.
18405 unsigned CombineOpc;
18406 if (Cond == ISD::SETUEQ) {
18409 CombineOpc = X86ISD::FOR;
18411 assert(Cond == ISD::SETONE);
18414 CombineOpc = X86ISD::FAND;
18417 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18418 DAG.getConstant(CC0, dl, MVT::i8));
18419 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18420 DAG.getConstant(CC1, dl, MVT::i8));
18421 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18423 // Handle all other FP comparisons here.
18424 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18425 DAG.getConstant(SSECC, dl, MVT::i8));
18428 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18429 // result type of SETCC. The bitcast is expected to be optimized away
18430 // during combining/isel.
    if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }

18437 MVT VTOp0 = Op0.getSimpleValueType();
18438 assert(VTOp0 == Op1.getSimpleValueType() &&
18439 "Expected operands with same type!");
18440 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18441 "Invalid number of packed elements for source and destination!");
18443 // This is being called by type legalization because v2i32 is marked custom
18444 // for result type legalization for v2f32.
  if (VTOp0 == MVT::v2i32)
    return SDValue();

18448 // The non-AVX512 code below works under the assumption that source and
18449 // destination types are the same.
18450 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18451 "Value types for source and destination must be the same!");
18453 // Break 256-bit integer vector compare into smaller ones.
18454 if (VT.is256BitVector() && !Subtarget.hasInt256())
18455 return Lower256IntVSETCC(Op, DAG);
18457 // The result is boolean, but operands are int/float
18458 if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements on KNL.
    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
           "Unexpected operand type");
    return LowerIntVSETCC_AVX512(Op, DAG);
  }

18466 // Lower using XOP integer comparisons.
18467 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18468 // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }

18492 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18493 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
18494 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18495 SDValue BC0 = peekThroughBitcasts(Op0);
    if (BC0.getOpcode() == ISD::AND) {
      APInt UndefElts;
      SmallVector<APInt, 64> EltBits;
      if (getTargetConstantBitsFromNode(BC0.getOperand(1),
                                        VT.getScalarSizeInBits(), UndefElts,
                                        EltBits, false, false)) {
        if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
          Cond = ISD::SETEQ;
          Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
        }
      }
    }
  }

  // If this is a SETNE against the signed minimum value, change it to SETGT.
  // If this is a SETNE against the signed maximum value, change it to SETLT,
  // which will be swapped to SETGT.
  // Otherwise we use PCMPEQ+invert.
  APInt ConstValue;
  if (Cond == ISD::SETNE &&
      ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
    if (ConstValue.isMinSignedValue())
      Cond = ISD::SETGT;
    else if (ConstValue.isMaxSignedValue())
      Cond = ISD::SETLT;
  }

18523 // If both operands are known non-negative, then an unsigned compare is the
18524 // same as a signed compare and there's no need to flip signbits.
18525 // TODO: We could check for more general simplifications here since we're
18526 // computing known bits.
18527 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18528 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18530 // Special case: Use min/max operations for unsigned compares. We only want
18531 // to do this for unsigned compares if we need to flip signs or if it allows
  // us to avoid an invert.
18533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18534 if (ISD::isUnsignedIntSetCC(Cond) &&
18535 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18536 TLI.isOperationLegal(ISD::UMIN, VT)) {
    bool Invert = false;
    unsigned Opc;
    switch (Cond) {
    default: llvm_unreachable("Unexpected condition code");
    case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETULE: Opc = ISD::UMIN; break;
    case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
    case ISD::SETUGE: Opc = ISD::UMAX; break;
    }

18547 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18548 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
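    // Op0 <=u Op1 iff umin(Op0, Op1) == Op0 (and dually for umax and >=u),
    // so comparing Op0 against the min/max result yields the mask without a
    // sign-bit flip.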
    // If the logical-not of the result is required, perform that now.
    if (Invert)
      Result = DAG.getNOT(dl, Result, VT);

    return Result;
  }

18557 // Try to use SUBUS and PCMPEQ.
  if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
    return V;

18561 // We are handling one of the integer comparisons here. Since SSE only has
18562 // GT and EQ comparisons for integer, swapping operands and multiple
18563 // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  if (Swap)
    std::swap(Op0, Op1);

18574 // Check that the operation in question is available (most are plain SSE2,
18575 // but PCMPGTQ and PCMPEQQ have different requirements).
18576 if (VT == MVT::v2i64) {
18577 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18578 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18580 // First cast everything to the right type.
18581 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18582 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18584 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18585 // bits of the inputs before performing those operations. The lower
18586 // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
18595 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18596 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18598 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
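      // After the conditional sign-bit flip above, the high halves compare
      // signed while the low halves always compare unsigned; the high halves
      // decide the result unless they are equal, in which case the low
      // halves decide.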
18599 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18600 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18602 // Create masks for only the low parts/high parts of the 64 bit integers.
18603 static const int MaskHi[] = { 1, 1, 3, 3 };
18604 static const int MaskLo[] = { 0, 0, 2, 2 };
18605 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18606 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18607 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18609 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

18618 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18619 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
18620 // pcmpeqd + pshufd + pand.
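    // A 64-bit lane is all-ones only if both of its 32-bit halves compared
    // equal, so AND the PCMPEQD result with a copy of itself that has the
    // halves of each lane swapped.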
18621 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18623 // First cast everything to the right type.
18624 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18625 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18628 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18630 // Make sure the lower and upper halves are both all-ones.
18631 static const int Mask[] = { 1, 0, 3, 2 };
18632 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

18642 // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

18652 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

18661 // Try to select this as a KTEST+SETCC if possible.
18662 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18663 const SDLoc &dl, SelectionDAG &DAG,
18664 const X86Subtarget &Subtarget) {
18665 // Only support equality comparisons.
  if (CC != ISD::SETEQ && CC != ISD::SETNE)
    return SDValue();

18669 // Must be a bitcast from vXi1.
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();

18673 Op0 = Op0.getOperand(0);
18674 MVT VT = Op0.getSimpleValueType();
18675 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18676 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
      !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
    return SDValue();

18680 X86::CondCode X86CC;
18681 if (isNullConstant(Op1)) {
18682 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18683 } else if (isAllOnesConstant(Op1)) {
18684 // C flag is set for all ones.
    X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
  } else
    return SDValue();

  // If the input is an OR, we can combine its operands into the KORTEST.
  SDValue LHS = Op0;
  SDValue RHS = Op0;
  if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18693 LHS = Op0.getOperand(0);
    RHS = Op0.getOperand(1);
  }

18697 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
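  // KORTEST sets ZF when the OR of the mask operands is all zeros and CF when
  // it is all ones, which is why equality against 0 maps to E/NE and equality
  // against all-ones maps to B/AE above.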
  return getSETCC(X86CC, KORTEST, dl, DAG);
}

18701 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18703 MVT VT = Op.getSimpleValueType();
18705 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18707 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18708 SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
18711 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18713 // Optimize to BT if possible.
18714 // Lower (X & (1 << N)) == 0 to BT(X, N).
18715 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18716 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18717 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18718 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
      return NewSetCC;
  }

  // Try to use PTEST for a tree of ORs equality-compared with 0.
18724 // TODO: We could do AND tree with all 1s as well by using the C flag.
18725 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18726 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
      return NewSetCC;
  }

18731 // Try to lower using KTEST.
  if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
    return NewSetCC;

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
18737 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18738 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18740 // If the input is a setcc, then reuse the input setcc or use a new one with
18741 // the inverted condition.
18742 if (Op0.getOpcode() == X86ISD::SETCC) {
18743 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
    }
  }

18753 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18754 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

18758 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18759 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  return getSETCC(X86CC, EFLAGS, dl, DAG);
}

18763 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18764 SDValue LHS = Op.getOperand(0);
18765 SDValue RHS = Op.getOperand(1);
18766 SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

18770 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18771 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18773 // Recreate the carry if needed.
18774 EVT CarryVT = Carry.getValueType();
18775 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18776 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18777 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18779 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18780 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
  return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}

18784 /// Return true if opcode is a X86 logical comparison.
18785 static bool isX86LogicalCmp(SDValue Op) {
18786 unsigned Opc = Op.getOpcode();
18787 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
18790 if (Op.getResNo() == 1 &&
18791 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18792 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18793 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

18803 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

18807 SDValue VOp0 = V.getOperand(0);
18808 unsigned InBits = VOp0.getValueSizeInBits();
18809 unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
}

18813 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18814 bool AddTest = true;
18815 SDValue Cond = Op.getOperand(0);
18816 SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

18822 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18823 // are available or VBLENDV if AVX is available.
18824 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18825 if (Cond.getOpcode() == ISD::SETCC &&
18826 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18827 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18828 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18829 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18830 unsigned SSECC = translateX86FSETCC(
18831 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18833 if (Subtarget.hasAVX512()) {
18834 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18835 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18836 assert(!VT.isVector() && "Not a scalar type?");
      return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
    }

18840 if (SSECC < 8 || Subtarget.hasAVX()) {
18841 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18842 DAG.getConstant(SSECC, DL, MVT::i8));
18844 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18845 // of 3 logic instructions for size savings and potentially speed.
18846 // Unfortunately, there is no scalar form of VBLENDV.
18848 // If either operand is a constant, don't try this. We can expect to
18849 // optimize away at least one of the logic instructions later in that
18850 // case, so that sequence would be faster than a variable blend.
18852 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18853 // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
18857 if (Subtarget.hasAVX() &&
18858 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18860 // Convert to vectors, do a VSELECT, and convert back to scalar.
18861 // All of the conversions should be optimized away.
18863 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18864 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18865 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18866 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18868 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18869 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18871 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18873 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

18882 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18883 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18884 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

18888 // For v64i1 without 64-bit support we need to split and rejoin.
18889 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18890 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18891 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18892 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18893 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18894 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18895 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18896 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18897 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
18902 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18903 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18904 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
18907 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18908 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18909 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18910 Op2Scalar = Op2.getOperand(0);
18911 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18912 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18913 Op1Scalar, Op2Scalar);
18914 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18915 return DAG.getBitcast(VT, newSelect);
18916 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

18922 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18923 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18924 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18925 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18926 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18927 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18928 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

18932 if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
18935 // If the condition was updated, it's possible that the operands of the
18936 // select were also updated (for example, EmitTest has a RAUW). Refresh
18937 // the local references to the select operands in case they got stale.
18938 Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

18943 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18944 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18945 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18946 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18947 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18948 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
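  // The first four forms work because the CMP against 1 below sets the carry
  // flag exactly when x == 0 (unsigned x < 1), and SETCC_CARRY then
  // broadcasts that carry into a 0 or -1 value.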
18949 if (Cond.getOpcode() == X86ISD::SETCC &&
18950 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18951 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18952 SDValue Cmp = Cond.getOperand(1);
18953 unsigned CondCode =
18954 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18956 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18957 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18958 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18959 SDValue CmpOp0 = Cmp.getOperand(0);
18961 // Apply further optimizations for special cases
18962 // (select (x != 0), -1, 0) -> neg & sbb
18963 // (select (x == 0), 0, -1) -> neg & sbb
18964 if (isNullConstant(Y) &&
18965 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18966 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18967 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18968 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18969 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18970 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

18975 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18976 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18977 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18979 SDValue Res = // Res = 0 or -1.
18980 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18981 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18983 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18984 Res = DAG.getNOT(DL, Res, Res.getValueType());
18986 if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
        return Res;
18989 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18990 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18991 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18992 SDValue CmpOp0 = Cmp.getOperand(0);
18993 SDValue Src1, Src2;
      // true if Op2 is XOR or OR operator and one of its operands
      // is equal to Op1
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // we need a mask of all zeros or ones with the same size as the other
        // operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
19021 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
19022 Neg); // -(and (x, 0x1))
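        // Negating the 0/1 value yields an all-zeros or all-ones mask, so the
        // AND below either passes z through untouched or blocks it entirely.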
19023 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
      }
    }
  }

19029 // Look past (and (setcc_carry (cmp ...)), 1).
19030 if (Cond.getOpcode() == ISD::AND &&
19031 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19032 isOneConstant(Cond.getOperand(1)))
19033 Cond = Cond.getOperand(0);
19035 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19036 // setting operand in place of the X86ISD::SETCC.
19037 unsigned CondOpcode = Cond.getOpcode();
19038 if (CondOpcode == X86ISD::SETCC ||
19039 CondOpcode == X86ISD::SETCC_CARRY) {
19040 CC = Cond.getOperand(0);
19042 SDValue Cmp = Cond.getOperand(1);
19043 unsigned Opc = Cmp.getOpcode();
19044 MVT VT = Op.getSimpleValueType();
19046 bool IllegalFPCMov = false;
19047 if (VT.isFloatingPoint() && !VT.isVector() &&
19048 !isScalarFPTypeInSSEReg(VT)) // FPStack?
19049 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
19051 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
19056 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19057 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19058 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19059 Cond.getOperand(0).getValueType() != MVT::i8)) {
19060 SDValue LHS = Cond.getOperand(0);
19061 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
19065 switch (CondOpcode) {
19066 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19067 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19068 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19069 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19070 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19071 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

19080 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
19082 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
19092 // Look past the truncate if the high bits are known zero.
19093 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19094 Cond = Cond.getOperand(0);
    // We know the result of AND is compared against zero. Try to match
    // it to BT.
19098 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19099 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
19100 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

19112 // a < b ? -1 : 0 -> RES = ~setcc_carry
19113 // a < b ? 0 : -1 -> RES = setcc_carry
19114 // a >= b ? -1 : 0 -> RES = setcc_carry
19115 // a >= b ? 0 : -1 -> RES = ~setcc_carry
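  // SETCC_CARRY is the SBB reg,reg idiom: it replicates the carry flag into
  // every bit of the result, producing the 0 or -1 mask without a branch.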
19116 if (Cond.getOpcode() == X86ISD::SUB) {
19117 Cond = ConvertCmpIfNecessary(Cond, DAG);
19118 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
19120 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
19121 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
19122 (isNullConstant(Op1) || isNullConstant(Op2))) {
19123 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

19132 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
19133 // widen the cmov and push the truncate through. This avoids introducing a new
19134 // branch during isel and doesn't add any extensions.
19135 if (Op.getValueType() == MVT::i8 &&
19136 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
19137 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
19138 if (T1.getValueType() == T2.getValueType() &&
19139 // Blacklist CopyFromReg to avoid partial register stalls.
19140 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
                                 CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

19147 // Promote i16 cmovs if it won't prevent folding a load.
19148 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
19149 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
19150 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
19151 SDValue Ops[] = { Op2, Op1, CC, Cond };
19152 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
    return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
  }

19156 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
19157 // condition is true.
19158 SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}

19162 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
19163 const X86Subtarget &Subtarget,
19164 SelectionDAG &DAG) {
19165 MVT VT = Op->getSimpleValueType(0);
19166 SDValue In = Op->getOperand(0);
19167 MVT InVT = In.getSimpleValueType();
19168 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
  MVT VTElt = VT.getVectorElementType();
  SDLoc dl(Op);

19172 unsigned NumElts = VT.getVectorNumElements();
  // Extend VT if the scalar type is i8/i16 and BWI is not supported.
  MVT ExtVT = VT;
19176 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
19177 // If v16i32 is to be avoided, we'll need to split and concatenate.
19178 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19179 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
    ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
  }

19184 // Widen to 512-bits if VLX is not supported.
19185 MVT WideVT = ExtVT;
19186 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19187 NumElts *= 512 / ExtVT.getSizeInBits();
19188 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19189 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
19190 In, DAG.getIntPtrConstant(0, dl));
    WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
  }

  SDValue V;
  MVT WideEltVT = WideVT.getVectorElementType();
  if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
      (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
    V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
  } else {
19200 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
19201 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
  }

  // Truncate if we had to extend i16/i8 above.
  if (VT != ExtVT) {
    WideVT = MVT::getVectorVT(VTElt, NumElts);
    V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
  }

  // Extract back to 128/256-bit if we widened.
  if (WideVT != VT)
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
                    DAG.getIntPtrConstant(0, dl));

  return V;
}

19219 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19220 SelectionDAG &DAG) {
19221 SDValue In = Op->getOperand(0);
19222 MVT InVT = In.getSimpleValueType();
19224 if (InVT.getVectorElementType() == MVT::i1)
19225 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19227 assert(Subtarget.hasAVX() && "Expected AVX support");
  return LowerAVXExtend(Op, DAG, Subtarget);
}

19231 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
19232 // For sign extend this needs to handle all vector sizes and SSE4.1 and
19233 // non-SSE4.1 targets. For zero extend this should only handle inputs of
19234 // MVT::v64i8 when BWI is not supported, but AVX512 is.
19235 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
19236 const X86Subtarget &Subtarget,
19237 SelectionDAG &DAG) {
19238 SDValue In = Op->getOperand(0);
19239 MVT VT = Op->getSimpleValueType(0);
19240 MVT InVT = In.getSimpleValueType();
19241 assert(VT.getSizeInBits() == InVT.getSizeInBits());
19243 MVT SVT = VT.getVectorElementType();
19244 MVT InSVT = InVT.getVectorElementType();
19245 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);

19258 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19259 // For 512-bit vectors, we need 128-bits or 256-bits.
19260 if (VT.getSizeInBits() > 128) {
19261 // Input needs to be at least the same number of elements as output, and
19262 // at least 128-bits.
19263 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
    InVT = In.getSimpleValueType();
  }

19267 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19268 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
19270 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
19271 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
19272 // need to be handled here for 256/512-bit results.
19273 if (Subtarget.hasInt256()) {
19274 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19275 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19276 X86ISD::VSEXT : X86ISD::VZEXT;
19277 return DAG.getNode(ExtOpc, dl, VT, In);
19280 // We should only get here for sign extend.
19281 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19282 "Unexpected opcode!");
  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;
19288 // As SRAI is only available on i16/i32 types, we expand only up to i32
19289 // and handle i64 separately.
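  // e.g. for v16i8: unpacking with undef places each source byte in the high
  // byte of a 16-bit lane (with junk below it), and the arithmetic shift
  // right then fills the lane with copies of the sign bit.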
19290 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19291 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19292 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19293 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }

19297 SDValue SignExt = Curr;
19298 if (CurrVT != InVT) {
19299 unsigned SignExtShift =
19300 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19301 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (CurrVT == VT)
    return SignExt;

19308 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19309 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19310 DAG.getConstant(31, dl, MVT::i8));
19311 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SignExt;
}

19318 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19319 SelectionDAG &DAG) {
19320 MVT VT = Op->getSimpleValueType(0);
19321 SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

19325 if (InVT.getVectorElementType() == MVT::i1)
19326 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19328 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19330 "Expected same number of elements");
19331 assert((VT.getVectorElementType() == MVT::i16 ||
19332 VT.getVectorElementType() == MVT::i32 ||
19333 VT.getVectorElementType() == MVT::i64) &&
19334 "Unexpected element type");
19335 assert((InVT.getVectorElementType() == MVT::i8 ||
19336 InVT.getVectorElementType() == MVT::i16 ||
19337 InVT.getVectorElementType() == MVT::i32) &&
19338 "Unexpected element type");
19340 if (Subtarget.hasInt256())
19341 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19343 // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and
  //             v4i32 to v4i64
  //
19347 // Divide input vector into two parts
19348 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
19349 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
19350 // concat the vectors to original VT
19352 unsigned NumElems = InVT.getVectorNumElements();
19353 SDValue Undef = DAG.getUNDEF(InVT);
19355 SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

19359 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19361 SmallVector<int,8> ShufMask2(NumElems, -1);
19362 for (unsigned i = 0; i != NumElems/2; ++i)
19363 ShufMask2[i] = i + NumElems/2;
19365 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19367 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19368 VT.getVectorNumElements() / 2);
19370 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19371 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}

19376 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19377 SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
  SDLoc dl(St);
19380 SDValue StoredVal = St->getValue();
19382 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19383 assert(StoredVal.getValueType().isVector() &&
19384 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
         StoredVal.getValueType().getVectorNumElements() <= 8 &&
         "Unexpected VT");
19387 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19388 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19389 "Expected AVX512F without AVX512DQI");
19391 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19392 DAG.getUNDEF(MVT::v8i1), StoredVal,
19393 DAG.getIntPtrConstant(0, dl));
19394 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19396 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19397 St->getPointerInfo(), St->getAlignment(),
                      St->getMemOperand()->getFlags());
}

19401 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19402 // may emit an illegal shuffle but the expansion is still better than scalar
19403 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
19405 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19406 // TODO: It is possible to support ZExt by zeroing the undef values during
19407 // the shuffle phase or after the shuffle.
19408 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19409 SelectionDAG &DAG) {
19410 MVT RegVT = Op.getSimpleValueType();
19411 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19412 assert(RegVT.isInteger() &&
19413 "We only custom lower integer vector sext loads.");
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
19417 EVT MemVT = Ld->getMemoryVT();
19419 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19420 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19421 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19422 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19423 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19424 "Expected AVX512F without AVX512DQI");
19426 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19427 Ld->getPointerInfo(), Ld->getAlignment(),
19428 Ld->getMemOperand()->getFlags());
19430 // Replace chain users with the new chain.
19431 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19432 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19434 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19435 DAG.getBitcast(MVT::v8i1, NewLd),
19436 DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
  }

19440 // Nothing useful we can do without SSE2 shuffles.
19441 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19444 unsigned RegSz = RegVT.getSizeInBits();
19446 ISD::LoadExtType Ext = Ld->getExtensionType();
19448 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19449 && "Only anyext and sext are currently implemented.");
19450 assert(MemVT != RegVT && "Cannot extend to the same type");
19451 assert(MemVT.isVector() && "Must load a vector from memory");
19453 unsigned NumElems = RegVT.getVectorNumElements();
19454 unsigned MemSz = MemVT.getSizeInBits();
19455 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19457 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19458 // The only way in which we have a legal 256-bit vector result but not the
19459 // integer 256-bit operations needed to directly lower a sextload is if we
19460 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19461 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19462 // correctly legalized. We do this late to allow the canonical form of
19463 // sextload to persist throughout the rest of the DAG combiner -- it wants
19464 // to fold together any extensions it can, and so will fuse a sign_extend
19465 // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
19468 // Just switch this to a normal load.
19469 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19470 "it must be a legal 128-bit vector "
19472 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19473 Ld->getPointerInfo(), Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    } else {
19476 assert(MemSz < 128 &&
19477 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19478 // Do an sext load to a 128-bit vector type. We want to use the same
19479 // number of elements, but elements half as wide. This will end up being
19480 // recursively lowered by this routine, but will succeed as we definitely
19481 // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19484 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19487 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    }

19491 // Replace chain users with the new chain.
19492 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19493 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19495 // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }

19499 // All sizes must be a power of two.
19500 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19501 "Non-power-of-two elements are not custom lowered!");
19503 // Attempt to load the original value using scalar loads.
19504 // Find the largest scalar type that divides the total loaded size.
19505 MVT SclrLoadTy = MVT::i8;
19506 for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }

19512 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
    SclrLoadTy = MVT::f64;

19517 // Calculate the number of scalar loads that we need to perform
19518 // in order to load our vector from memory.
19519 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19521 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19522 "Can only lower sext loads with a single scalar load!");
  unsigned loadRegSize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    loadRegSize = 128;

  // If we don't have BWI we won't be able to create the shuffle needed for
  // v8i8->v8i64.
  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
      MemVT == MVT::v8i8)
    loadRegSize = 2 * RegSz;

19534 // Represent our vector as a sequence of elements which are the
19535 // largest scalar that we can load.
  EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
19539 // Represent the data using the same element type that is stored in
  // memory. In practice, we "widen" MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegSize / MemVT.getScalarSizeInBits());
19545 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19546 "Invalid vector type");
19548 // We can't shuffle using an illegal type.
19549 assert(TLI.isTypeLegal(WideVecVT) &&
19550 "We only lower types that form legal widened vector types");
19552 SmallVector<SDValue, 8> Chains;
19553 SDValue Ptr = Ld->getBasePtr();
19554 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19555 TLI.getPointerTy(DAG.getDataLayout()));
19556 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19558 for (unsigned i = 0; i < NumLoads; ++i) {
19559 // Perform a single load.
19560 SDValue ScalarLoad =
19561 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19562 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19563 Chains.push_back(ScalarLoad.getValue(1));
19564 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19565 // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
  }

19575 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19577 // Bitcast the loaded value to a vector of the original element type, in
19578 // the size of the target vector type.
19579 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19580 unsigned SizeRatio = RegSz / MemSz;
19582 if (Ext == ISD::SEXTLOAD) {
19583 // If we have SSE4.1, we can directly emit a VSEXT node.
19584 if (Subtarget.hasSSE41()) {
19585 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }

    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
    // lanes.
19592 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19593 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19595 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }

19600 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19601 MemVT == MVT::v8i8) {
19602 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Sext;
  }

19607 // Redistribute the loaded elements into the different locations.
19608 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19609 for (unsigned i = 0; i != NumElems; ++i)
19610 ShuffleVec[i * SizeRatio] = i;
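  // Loaded element i lands in position i * SizeRatio of the wide vector, so
  // after the bitcast below it occupies the low bits of register element i,
  // with the remaining positions left undef.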
19612 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19613 DAG.getUNDEF(WideVecVT), ShuffleVec);
19615 // Bitcast to the requested type.
19616 Shuff = DAG.getBitcast(RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}

19621 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19622 /// each of which has no other use apart from the AND / OR.
19623 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19624 Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
19627 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19628 Op.getOperand(0).hasOneUse() &&
19629 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

19633 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
19634 /// SETCC node has a single use.
19635 static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
19638 if (isOneConstant(Op.getOperand(1)))
19639 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}

19644 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19645 bool addTest = true;
19646 SDValue Chain = Op.getOperand(0);
19647 SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
19651 bool Inverted = false;
19653 if (Cond.getOpcode() == ISD::SETCC) {
19654 // Check for setcc([su]{add,sub,mul}o == 0).
19655 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19656 isNullConstant(Cond.getOperand(1)) &&
19657 Cond.getOperand(0).getResNo() == 1 &&
19658 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19659 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19660 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19661 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19662 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
19681 if (Cond.getOpcode() == ISD::AND &&
19682 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19683 isOneConstant(Cond.getOperand(1)))
19684 Cond = Cond.getOperand(0);
19686 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19687 // setting operand in place of the X86ISD::SETCC.
19688 unsigned CondOpcode = Cond.getOpcode();
19689 if (CondOpcode == X86ISD::SETCC ||
19690 CondOpcode == X86ISD::SETCC_CARRY) {
19691 CC = Cond.getOperand(0);
19693 SDValue Cmp = Cond.getOperand(1);
19694 unsigned Opc = Cmp.getOpcode();
19695 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);
        addTest = false;
        break;
      }
    }
  }

19712 CondOpcode = Cond.getOpcode();
19713 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19714 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19715 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19716 Cond.getOperand(0).getValueType() != MVT::i8)) {
19717 SDValue LHS = Cond.getOperand(0);
19718 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
19722 // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
19725 switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19740 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19741 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19746 if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

19752 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19754 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
19763 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19764 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19765 if (CondOpc == ISD::OR) {
19766 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
19769 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19770 isX86LogicalCmp(Cmp)) {
19771 CC = Cond.getOperand(0).getOperand(0);
19772 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19773 Chain, Dest, CC, Cmp);
        CC = Cond.getOperand(1).getOperand(0);
        Cond = Cmp;
        addTest = false;
      }
19778 } else { // ISD::AND
19779 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19780 // two branches instead of an explicit AND instruction with a
19781 // separate test. However, we only do this if this block doesn't
19782 // have a fall-through edge, because this requires an explicit
19783 // jmp when the condition is false.
19784 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19785 isX86LogicalCmp(Cmp) &&
19786 Op.getNode()->hasOneUse()) {
19787 X86::CondCode CCode =
19788 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19789 CCode = X86::GetOppositeBranchCondition(CCode);
19790 CC = DAG.getConstant(CCode, dl, MVT::i8);
19791 SDNode *User = *Op.getNode()->use_begin();
19792 // Look for an unconditional branch following this conditional branch.
19793 // We need this because we need to reverse the successors in order
19794 // to implement FCMP_OEQ.
19795 if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

19803 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19804 Chain, Dest, CC, Cmp);
19805 X86::CondCode CCode =
19806 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19807 CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    }
19814 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19815 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
    // It should be transformed during dag combiner except when the condition
    // is set by an arithmetic-with-overflow node.
19818 X86::CondCode CCode =
19819 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19820 CCode = X86::GetOppositeBranchCondition(CCode);
19821 CC = DAG.getConstant(CCode, dl, MVT::i8);
    Cond = Cond.getOperand(0).getOperand(1);
    addTest = false;
19824 } else if (Cond.getOpcode() == ISD::SETCC &&
19825 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19826 // For FCMP_OEQ, we can emit
19827 // two branches instead of an explicit AND instruction with a
19828 // separate test. However, we only do this if this block doesn't
19829 // have a fall-through edge, because this requires an explicit
19830 // jmp when the condition is false.
19831 if (Op.getNode()->hasOneUse()) {
19832 SDNode *User = *Op.getNode()->use_begin();
19833 // Look for an unconditional branch following this conditional branch.
19834 // We need this because we need to reverse the successors in order
19835 // to implement FCMP_OEQ.
19836 if (User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User);
        (void)NewBR;
        Dest = FalseBB;

19844 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19845 Cond.getOperand(0), Cond.getOperand(1));
19846 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19847 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19848 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19849 Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
      }
    }
19855 } else if (Cond.getOpcode() == ISD::SETCC &&
19856 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19857 // For FCMP_UNE, we can emit
19858 // two branches instead of an explicit AND instruction with a
19859 // separate test. However, we only do this if this block doesn't
19860 // have a fall-through edge, because this requires an explicit
19861 // jmp when the condition is false.
19862 if (Op.getNode()->hasOneUse()) {
19863 SDNode *User = *Op.getNode()->use_begin();
19864 // Look for an unconditional branch following this conditional branch.
19865 // We need this because we need to reverse the successors in order
19866 // to implement FCMP_UNE.
19867 if (User->getOpcode() == ISD::BR) {
        SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User);
        (void)NewBR;
        Dest = FalseBB;

19874 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19875 Cond.getOperand(0), Cond.getOperand(1));
19876 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19877 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19878 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19879 Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
19891 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19892 Cond = Cond.getOperand(0);
    // We know the result of AND is compared against zero. Try to match
    // it to BT.
19896 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19897 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19898 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
19906 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19907 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
19910 Cond = ConvertCmpIfNecessary(Cond, DAG);
19911 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}

19915 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19916 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19917 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19918 // that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
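// For example, allocating 16K in one go must touch each of the four
// intervening 4K pages in address order; otherwise the access lands beyond
// the guard page and the OS cannot grow the stack mapping.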
19921 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19922 SelectionDAG &DAG) const {
19923 MachineFunction &MF = DAG.getMachineFunction();
19924 bool SplitStack = MF.shouldSplitStack();
19925 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19926 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack || EmitStackProbe;
  SDLoc dl(Op);

  // Get the inputs.
19931 SDNode *Node = Op.getNode();
19932 SDValue Chain = Op.getOperand(0);
19933 SDValue Size = Op.getOperand(1);
19934 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19935 EVT VT = Node->getValueType(0);
19937 // Chain the dynamic stack allocation so that it doesn't modify the stack
19938 // pointer when other instructions are using the stack.
19939 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19941 bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
19946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19947 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19948 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19949 " not tell us which reg is the stack pointer!");
19951 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19952 Chain = SP.getValue(1);
19953 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19954 unsigned StackAlign = TFI.getStackAlignment();
19955 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19956 if (Align > StackAlign)
19957 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19958 DAG.getConstant(-(uint64_t)Align, dl, VT));
19959 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19960 } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use them together with
      // nested function parameters.
      const Function &F = MF.getFunction();
      for (const auto &A : F.args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
      }
    }

    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
19980 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19981 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19982 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19984 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19985 unsigned SPReg = RegInfo->getStackRegister();
19986 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
20005 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
20006 MachineFunction &MF = DAG.getMachineFunction();
20007 auto PtrVT = getPointerTy(MF.getDataLayout());
20008 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);
20013 if (!Subtarget.is64Bit() ||
20014 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
20015 // vastart just stores the address of the VarArgsFrameIndex slot into the
20016 // memory location argument.
20017 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20018 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters coming in memory).
  //   reg_save_area
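  // For reference, this layout matches the SysV x86-64 ABI va_list:
  //   typedef struct {
  //     unsigned int gp_offset;      // byte offset 0
  //     unsigned int fp_offset;      // byte offset 4
  //     void *overflow_arg_area;     // byte offset 8
  //     void *reg_save_area;         // byte offset 16
  //   } __va_list_tag[1];
  // The stores below write these four fields at exactly those offsets.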
20027 SmallVector<SDValue, 8> MemOps;
20028 SDValue FIN = Op.getOperand(1);
20030 SDValue Store = DAG.getStore(
20031 Op.getOperand(0), DL,
20032 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
20033 MachinePointerInfo(SV));
20034 MemOps.push_back(Store);
20037 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
20038 Store = DAG.getStore(
20039 Op.getOperand(0), DL,
20040 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
20041 MachinePointerInfo(SV, 4));
20042 MemOps.push_back(Store);
20044 // Store ptr to overflow_arg_area
20045 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
20046 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
20049 MemOps.push_back(Store);
20051 // Store ptr to reg_save_area.
20052 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
20053 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
20054 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
20055 Store = DAG.getStore(
20056 Op.getOperand(0), DL, RSFIN, FIN,
20057 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
20058 MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
20062 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
20063 assert(Subtarget.is64Bit() &&
20064 "LowerVAARG only handles 64-bit va_arg!");
20065 assert(Op.getNumOperands() == 4);
20067 MachineFunction &MF = DAG.getMachineFunction();
20068 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
20069 // The Win64 ABI uses char* instead of a structure.
20070 return DAG.expandVAArg(Op.getNode());
20072 SDValue Chain = Op.getOperand(0);
20073 SDValue SrcPtr = Op.getOperand(1);
20074 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);
20078 EVT ArgVT = Op.getNode()->getValueType(0);
20079 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;
20083 // Decide which area this value should be read from.
20084 // TODO: Implement the AMD64 ABI in its entirety. This simple
20085 // selection mechanism works only for the basic types.
20086 if (ArgVT == MVT::f80) {
20087 llvm_unreachable("va_arg for f80 not yet implemented");
20088 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
20089 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
20090 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
20091 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }
20096 if (ArgMode == 2) {
20097 // Sanity Check: Make sure using fp_offset makes sense.
20098 assert(!Subtarget.useSoftFloat() &&
20099 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }
20103 // Insert VAARG_64 node into the DAG
20104 // VAARG_64 returns two values: Variable Argument Address, Chain
20105 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
20106 DAG.getConstant(ArgMode, dl, MVT::i8),
20107 DAG.getConstant(Align, dl, MVT::i32)};
20108 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
20109 SDValue VAARG = DAG.getMemIntrinsicNode(
20110 X86ISD::VAARG_64, dl,
20111 VTs, InstOps, MVT::i64,
20112 MachinePointerInfo(SV),
      /*Align=*/0,
      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
20115 Chain = VAARG.getValue(1);
20117 // Load the next argument and return it
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
20121 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
20122 SelectionDAG &DAG) {
20123 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
20124 // where a va_list is still an i8*.
20125 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
20126 if (Subtarget.isCallingConvWin64(
20127 DAG.getMachineFunction().getFunction().getCallingConv()))
20128 // Probably a Win64 va_copy.
20129 return DAG.expandVACopy(Op.getNode());
20131 SDValue Chain = Op.getOperand(0);
20132 SDValue DstPtr = Op.getOperand(1);
20133 SDValue SrcPtr = Op.getOperand(2);
20134 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       /*AlwaysInline=*/false, /*isTailCall=*/false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
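
// Note on the constant 24 above: it is sizeof(__va_list_tag) on LP64, i.e.
// two 4-byte offsets plus two 8-byte pointers, so the memcpy copies the whole
// va_list structure.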
20144 /// Handle vector element shifts where the shift amount is a constant.
20145 /// Takes immediate version of shift as input.
20146 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
20147 SDValue SrcOp, uint64_t ShiftAmt,
20148 SelectionDAG &DAG) {
20149 MVT ElementType = VT.getVectorElementType();
20151 // Bitcast the source vector to the output type, this is mainly necessary for
20152 // vXi8/vXi64 shifts.
20153 if (VT != SrcOp.getSimpleValueType())
20154 SrcOp = DAG.getBitcast(VT, SrcOp);
  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }
20168 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
20169 && "Unknown target vector shift-by-constant node");
20171 // Fold this packed vector shift into a build vector if SrcOp is a
20172 // vector of Constants or UNDEFs.
20173 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
20174 SmallVector<SDValue, 8> Elts;
20175 unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }
20221 return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
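
// Example of the constant fold above (illustrative values): shifting the
// constant vector <1, 2, 3, 4> left by 1 via
//   getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v4i32, V, 1, DAG)
// folds directly to the build_vector <2, 4, 6, 8>, so no VSHLI node is
// emitted at all.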
20225 /// Handle vector element shifts where the shift amount may or may not be a
20226 /// constant. Takes immediate version of shift as input.
20227 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
20228 SDValue SrcOp, SDValue ShAmt,
20229 const X86Subtarget &Subtarget,
20230 SelectionDAG &DAG) {
20231 MVT SVT = ShAmt.getSimpleValueType();
20232 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
20234 // Catch shift-by-constant.
20235 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
20236 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
20237 CShAmt->getZExtValue(), DAG);
  // Change opcode to the non-immediate version.
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
20242 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
20243 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }
20247 // Need to build a vector containing shift amount.
20248 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
20249 // +=================+============+=======================================+
20250 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
20251 // +=================+============+=======================================+
20252 // | i64 | Yes, No | Use ShAmt as lowest elt |
20253 // | i32 | Yes | zero-extend in-reg |
20254 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
20255 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
20256 // +=================+============+=======================================+
20258 if (SVT == MVT::i64)
20259 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20260 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20261 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
20262 ShAmt = ShAmt.getOperand(0);
20263 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20264 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20265 } else if (Subtarget.hasSSE41() &&
20266 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20267 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
20271 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }
20275 // The return type has to be a 128-bit type with the same element
20276 // type as the input type.
20277 MVT EltVT = VT.getVectorElementType();
20278 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20280 ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
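
// Example (illustrative): an SSE2-only v8i16 shift by a scalar i32 amount
// takes the build_vector row of the table above, producing
//   X86ISD::VSHL v8i16 Src, bitcast(v4i32 (ShAmt, 0, undef, undef))
// which leaves only the low 64 bits of the count vector defined, matching the
// hardware's use of just the low 64 bits of the shift count.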
20284 /// Return Mask with the necessary casting or extending
20285 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
20286 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
20290 if (isAllOnesConstant(Mask))
20291 return DAG.getConstant(1, dl, MaskVT);
20292 if (X86::isZeroNode(Mask))
20293 return DAG.getConstant(0, dl, MaskVT);
20295 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20296 // Mask should be extended
20297 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }
20301 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20302 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20303 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // In 32-bit mode a bitcast of i64 is illegal; extract and split it instead.
    SDValue Lo, Hi;
    Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20307 DAG.getConstant(0, dl, MVT::i32));
20308 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20309 DAG.getConstant(1, dl, MVT::i32));
20311 Lo = DAG.getBitcast(MVT::v32i1, Lo);
20312 Hi = DAG.getBitcast(MVT::v32i1, Hi);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
  // are extracted by EXTRACT_SUBVECTOR.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
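
// Example (illustrative): for an i8 mask and MaskVT == v2i1, the mask is
// bitcast to v8i1 and the low two elements are extracted, so only mask bits
// 0 and 1 influence the masked operation; the remaining bits are dropped.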
20326 /// Return (and \p Op, \p Mask) for compare instructions or
20327 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20328 /// necessary casting or extending for \p Mask when lowering masking intrinsics
20329 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20330 SDValue PreservedSrc,
20331 const X86Subtarget &Subtarget,
20332 SelectionDAG &DAG) {
20333 MVT VT = Op.getSimpleValueType();
20334 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
  case X86ISD::CMPM:
  case X86ISD::CMPM_RND:
20347 case X86ISD::VPSHUFBITQMB:
20348 case X86ISD::VFPCLASS:
20349 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20350 case ISD::TRUNCATE:
20351 case X86ISD::VTRUNC:
20352 case X86ISD::VTRUNCS:
20353 case X86ISD::VTRUNCUS:
20354 case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example, vpmovqb requires only AVX512,
    // while a vselect operating on byte elements requires BWI.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }
  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
20366 /// Creates an SDNode for a predicated scalar operation.
20367 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
20368 /// The mask is coming as MVT::i8 and it should be transformed
20369 /// to MVT::v1i1 while lowering masking intrinsics.
20370 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20371 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20372 /// for a scalar instruction.
20373 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20374 SDValue PreservedSrc,
20375 const X86Subtarget &Subtarget,
20376 SelectionDAG &DAG) {
20378 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
20386 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
20387 if (Op.getOpcode() == X86ISD::FSETCCM ||
20388 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20389 Op.getOpcode() == X86ISD::VFPCLASSS)
20390 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
20392 if (PreservedSrc.isUndef())
20393 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
20397 static int getSEHRegistrationNodeSize(const Function *Fn) {
20398 if (!Fn->hasPersonalityFn())
20399 report_fatal_error(
20400 "querying registration node size for function without personality");
20401 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20402 // WinEHStatePass for the full struct definition.
20403 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20404 case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
20408 report_fatal_error(
20409 "can only recover FP for 32-bit MSVC EH personality functions");
20412 /// When the MSVC runtime transfers control to us, either to an outlined
20413 /// function or when returning to a parent frame after catching an exception, we
20414 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20415 /// Here's the math:
20416 /// RegNodeBase = EntryEBP - RegNodeSize
20417 /// ParentFP = RegNodeBase - ParentFrameOffset
20418 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20419 /// subtracting the offset (negative on x86) takes us back to the parent FP.
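/// Worked example with made-up numbers: in a 32-bit MSVC C++ EH function,
/// RegNodeSize is 16; if EntryEBP is 0x1000 and ParentFrameOffset is -64,
/// then RegNodeBase = 0x1000 - 16 = 0xff0 and
/// ParentFP = 0xff0 - (-64) = 0x1030.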
20420 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20421 SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;
20425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20426 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20428 // It's possible that the parent function no longer has a personality function
20429 // if the exceptional code was optimized away, in which case we just return
20430 // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;
20434 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20435 // registration, or the .set_setframe offset.
20436 MCSymbol *OffsetSym =
20437 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20438 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20439 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20440 SDValue ParentFrameOffset =
20441 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20443 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20444 // prologue to RBP in the parent function.
20445 const X86Subtarget &Subtarget =
20446 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20447 if (Subtarget.is64Bit())
20448 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
20450 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20451 // RegNodeBase = EntryEBP - RegNodeSize
20452 // ParentFP = RegNodeBase - ParentFrameOffset
20453 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20454 DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
20458 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20459 SelectionDAG &DAG) const {
20460 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20461 auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;
20465 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20471 MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch (IntrData->Type) {
20475 case INTR_TYPE_1OP: {
20476 // We specify 2 possible opcodes for intrinsics with rounding modes.
20477 // First, we check if the intrinsic may have non-default rounding mode,
20478 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20479 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20480 if (IntrWithRoundingModeOpcode != 0) {
20481 SDValue Rnd = Op.getOperand(2);
20482 if (!isRoundModeCurDirection(Rnd)) {
20483 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
                             Op.getOperand(1), Rnd);
        }
      }
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    }
20489 case INTR_TYPE_2OP:
20490 case INTR_TYPE_2OP_IMM8: {
20491 SDValue Src2 = Op.getOperand(2);
20493 if (IntrData->Type == INTR_TYPE_2OP_IMM8)
20494 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20496 // We specify 2 possible opcodes for intrinsics with rounding modes.
20497 // First, we check if the intrinsic may have non-default rounding mode,
20498 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20499 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20500 if (IntrWithRoundingModeOpcode != 0) {
20501 SDValue Rnd = Op.getOperand(3);
20502 if (!isRoundModeCurDirection(Rnd)) {
20503 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
                             Op.getOperand(1), Src2, Rnd);
        }
      }

      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), Src2);
    }
20511 case INTR_TYPE_3OP:
20512 case INTR_TYPE_3OP_IMM8: {
20513 SDValue Src1 = Op.getOperand(1);
20514 SDValue Src2 = Op.getOperand(2);
20515 SDValue Src3 = Op.getOperand(3);
20517 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
20518 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20520 // We specify 2 possible opcodes for intrinsics with rounding modes.
20521 // First, we check if the intrinsic may have non-default rounding mode,
20522 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20523 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20524 if (IntrWithRoundingModeOpcode != 0) {
20525 SDValue Rnd = Op.getOperand(4);
20526 if (!isRoundModeCurDirection(Rnd)) {
20527 return DAG.getNode(IntrWithRoundingModeOpcode,
20528 dl, Op.getValueType(),
                             Src1, Src2, Src3, Rnd);
        }
      }

      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Src1, Src2, Src3);
    }
20536 case INTR_TYPE_4OP:
20537 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20538 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20539 case INTR_TYPE_1OP_MASK_RM: {
20540 SDValue Src = Op.getOperand(1);
20541 SDValue PassThru = Op.getOperand(2);
20542 SDValue Mask = Op.getOperand(3);
20543 SDValue RoundingMode;
20544 // We always add rounding mode to the Node.
20545 // If the rounding mode is not specified, we add the
20546 // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
            DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
20557 case INTR_TYPE_1OP_MASK: {
20558 SDValue Src = Op.getOperand(1);
20559 SDValue PassThru = Op.getOperand(2);
20560 SDValue Mask = Op.getOperand(3);
20561 // We add rounding mode to the Node when
20562 // - RM Opcode is specified and
20563 // - RM is not "current direction".
20564 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20565 if (IntrWithRoundingModeOpcode != 0) {
20566 SDValue Rnd = Op.getOperand(4);
20567 if (!isRoundModeCurDirection(Rnd)) {
20568 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                    dl, Op.getValueType(),
                                                    Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
20577 case INTR_TYPE_SCALAR_MASK: {
20578 SDValue Src1 = Op.getOperand(1);
20579 SDValue Src2 = Op.getOperand(2);
20580 SDValue passThru = Op.getOperand(3);
20581 SDValue Mask = Op.getOperand(4);
20582 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20583 // There are 2 kinds of intrinsics in this group:
20584 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20585 // (2) With rounding mode and sae - 7 operands.
20586 bool HasRounding = IntrWithRoundingModeOpcode != 0;
      if (Op.getNumOperands() == (5U + HasRounding)) {
        if (HasRounding) {
          SDValue Rnd = Op.getOperand(5);
20590 if (!isRoundModeCurDirection(Rnd))
20591 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20592 dl, VT, Src1, Src2, Rnd),
20593 Mask, passThru, Subtarget, DAG);
        }
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                                Src2),
                                    Mask, passThru, Subtarget, DAG);
      }
20600 assert(Op.getNumOperands() == (6U + HasRounding) &&
20601 "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      if (HasRounding) {
        SDValue Sae = Op.getOperand(6);
20605 if (!isRoundModeCurDirection(Sae))
20606 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20607 dl, VT, Src1, Src2,
20608 RoundingMode, Sae),
                                      Mask, passThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, RoundingMode),
                                  Mask, passThru, Subtarget, DAG);
    }
20615 case INTR_TYPE_SCALAR_MASK_RM: {
20616 SDValue Src1 = Op.getOperand(1);
20617 SDValue Src2 = Op.getOperand(2);
20618 SDValue Src0 = Op.getOperand(3);
20619 SDValue Mask = Op.getOperand(4);
20620 // There are 2 kinds of intrinsics in this group:
20621 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20622 // (2) With rounding mode and sae - 7 operands.
20623 if (Op.getNumOperands() == 6) {
20624 SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
20629 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20630 SDValue RoundingMode = Op.getOperand(5);
20631 SDValue Sae = Op.getOperand(6);
20632 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20633 RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
20636 case INTR_TYPE_2OP_MASK: {
20637 SDValue Src1 = Op.getOperand(1);
20638 SDValue Src2 = Op.getOperand(2);
20639 SDValue PassThru = Op.getOperand(3);
20640 SDValue Mask = Op.getOperand(4);
20642 // We specify 2 possible opcodes for intrinsics with rounding modes.
20643 // First, we check if the intrinsic may have non-default rounding mode,
20644 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20645 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20646 if (IntrWithRoundingModeOpcode != 0) {
20647 SDValue Rnd = Op.getOperand(5);
20648 if (!isRoundModeCurDirection(Rnd)) {
20649 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                    dl, Op.getValueType(),
                                                    Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
20655 // TODO: Intrinsics should have fast-math-flags to propagate.
20656 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
20659 case INTR_TYPE_2OP_MASK_RM: {
20660 SDValue Src1 = Op.getOperand(1);
20661 SDValue Src2 = Op.getOperand(2);
20662 SDValue PassThru = Op.getOperand(3);
20663 SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
20677 case INTR_TYPE_3OP_SCALAR_MASK: {
20678 SDValue Src1 = Op.getOperand(1);
20679 SDValue Src2 = Op.getOperand(2);
20680 SDValue Src3 = Op.getOperand(3);
20681 SDValue PassThru = Op.getOperand(4);
20682 SDValue Mask = Op.getOperand(5);
20684 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20685 if (IntrWithRoundingModeOpcode != 0) {
20686 SDValue Rnd = Op.getOperand(6);
20687 if (!isRoundModeCurDirection(Rnd))
20688 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20689 dl, VT, Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
20696 case INTR_TYPE_3OP_MASK: {
20697 SDValue Src1 = Op.getOperand(1);
20698 SDValue Src2 = Op.getOperand(2);
20699 SDValue Src3 = Op.getOperand(3);
20700 SDValue PassThru = Op.getOperand(4);
20701 SDValue Mask = Op.getOperand(5);
20703 // We specify 2 possible opcodes for intrinsics with rounding modes.
20704 // First, we check if the intrinsic may have non-default rounding mode,
20705 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20706 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20707 if (IntrWithRoundingModeOpcode != 0) {
20708 SDValue Rnd = Op.getOperand(6);
20709 if (!isRoundModeCurDirection(Rnd)) {
20710 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20711 dl, Op.getValueType(),
20712 Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP: {
20721 SDValue Src1 = Op.getOperand(1);
20722 SDValue Src2 = Op.getOperand(2);
20724 // Swap Src1 and Src2 in the node creation
      return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
    }
    case FMA_OP_MASKZ:
    case FMA_OP_MASK: {
20729 SDValue Src1 = Op.getOperand(1);
20730 SDValue Src2 = Op.getOperand(2);
20731 SDValue Src3 = Op.getOperand(3);
20732 SDValue Mask = Op.getOperand(4);
20733 MVT VT = Op.getSimpleValueType();
20734 SDValue PassThru = SDValue();
20736 // set PassThru element
20737 if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src1;
20742 // We specify 2 possible opcodes for intrinsics with rounding modes.
20743 // First, we check if the intrinsic may have non-default rounding mode,
20744 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20745 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20746 if (IntrWithRoundingModeOpcode != 0) {
20747 SDValue Rnd = Op.getOperand(5);
20748 if (!isRoundModeCurDirection(Rnd))
20749 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20750 dl, Op.getValueType(),
20751 Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case IFMA_OP:
      // NOTE: We need to swizzle the operands to pass the multiply operands
      // first.
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
    case CVTPD2PS:
20765 // ISD::FP_ROUND has a second argument that indicates if the truncation
20766 // does not change the value. Set it to 0 since it can change.
20767 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20768 DAG.getIntPtrConstant(0, dl));
20769 case CVTPD2PS_MASK: {
20770 SDValue Src = Op.getOperand(1);
20771 SDValue PassThru = Op.getOperand(2);
20772 SDValue Mask = Op.getOperand(3);
20773 // We add rounding mode to the Node when
20774 // - RM Opcode is specified and
20775 // - RM is not "current direction".
20776 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20777 if (IntrWithRoundingModeOpcode != 0) {
20778 SDValue Rnd = Op.getOperand(4);
20779 if (!isRoundModeCurDirection(Rnd)) {
20780 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20781 dl, Op.getValueType(),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20787 // ISD::FP_ROUND has a second argument that indicates if the truncation
20788 // does not change the value. Set it to 0 since it can change.
20789 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20790 DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics
20795 SDValue Src1 = Op.getOperand(1);
20796 MVT MaskVT = Op.getSimpleValueType();
20797 SDValue Imm = Op.getOperand(2);
      return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
20802 SDValue Imm = Op.getOperand(2);
20803 SDValue Mask = Op.getOperand(3);
20804 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
                                                 Subtarget, DAG);
20808 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20809 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20810 DAG.getConstant(0, dl, MVT::v8i1),
20811 FPclassMask, DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(MVT::i8, Ins);
    }
    case CMP_MASK: {
      // Comparison intrinsics with masks.
20816 // Example of transformation:
20817 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector zero,
20821 // (v2i1 (and (PCMPEQM %a, %b),
20822 // (extract_subvector
20823 // (v8i1 (bitcast %mask)), 0))), 0))))
20824 MVT VT = Op.getOperand(1).getSimpleValueType();
20825 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20826 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20827 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20828 Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                                Op.getOperand(2));
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
                                             Subtarget, DAG);
20833 // Need to fill with zeros to ensure the bitcast will produce zeroes
20834 // for the upper bits in the v2i1/v4i1 case.
20835 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20836 DAG.getConstant(0, dl, BitcastVT),
20837 CmpMask, DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_CC: {
      MVT MaskVT = Op.getSimpleValueType();
      SDValue Cmp;
      SDValue CC = Op.getOperand(3);
      CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have non-default rounding mode,
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2), CC);

      return Cmp;
    }
20862 case CMP_MASK_SCALAR_CC: {
20863 SDValue Src1 = Op.getOperand(1);
20864 SDValue Src2 = Op.getOperand(2);
20865 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
                                             Subtarget, DAG);
      // Need to fill with zeros to ensure the bitcast will produce zeroes
      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
                                DAG.getConstant(0, dl, MVT::v8i1),
                                CmpMask, DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(MVT::i8, Ins);
    }
    case COMI: { // Comparison intrinsics
20888 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20889 SDValue LHS = Op.getOperand(1);
20890 SDValue RHS = Op.getOperand(2);
20891 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 1 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
20925 case COMI_RM: { // Comparison intrinsics with Sae
20926 SDValue LHS = Op.getOperand(1);
20927 SDValue RHS = Op.getOperand(2);
20928 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
20933 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20937 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20938 // Need to fill with zeros to ensure the bitcast will produce zeroes
20939 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20940 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20941 DAG.getConstant(0, dl, MVT::v16i1),
20942 FCmp, DAG.getIntPtrConstant(0, dl));
20943 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
                         DAG.getBitcast(MVT::i16, Ins));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
20950 case COMPRESS_EXPAND_IN_REG: {
20951 SDValue Mask = Op.getOperand(3);
20952 SDValue DataToCompress = Op.getOperand(1);
20953 SDValue PassThru = Op.getOperand(2);
20954 if (isAllOnesConstant(Mask)) // return data as is
20955 return Op.getOperand(1);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FIXUPIMM:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMMS:
    case FIXUPIMM_MASKZ: {
20965 SDValue Src1 = Op.getOperand(1);
20966 SDValue Src2 = Op.getOperand(2);
20967 SDValue Src3 = Op.getOperand(3);
20968 SDValue Imm = Op.getOperand(4);
20969 SDValue Mask = Op.getOperand(5);
20970 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20971 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
20978 Rnd = Op.getOperand(6);
20980 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20981 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20982 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20983 Src1, Src2, Src3, Imm, Rnd),
20984 Mask, Passthru, Subtarget, DAG);
20985 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20986 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20987 Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case ROUNDP: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20992 // Clear the upper bits of the rounding immediate so that the legacy
20993 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(2),
                                         DAG.getConstant(0xf, dl, MVT::i32));
20997 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), RoundingMode);
    }
    case ROUNDS: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
21002 // Clear the upper bits of the rounding immediate so that the legacy
21003 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(3),
                                         DAG.getConstant(0xf, dl, MVT::i32));
21007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), RoundingMode);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
21021 case Intrinsic::x86_sse41_ptestz:
21022 case Intrinsic::x86_sse41_ptestc:
21023 case Intrinsic::x86_sse41_ptestnzc:
21024 case Intrinsic::x86_avx_ptestz_256:
21025 case Intrinsic::x86_avx_ptestc_256:
21026 case Intrinsic::x86_avx_ptestnzc_256:
21027 case Intrinsic::x86_avx_vtestz_ps:
21028 case Intrinsic::x86_avx_vtestc_ps:
21029 case Intrinsic::x86_avx_vtestnzc_ps:
21030 case Intrinsic::x86_avx_vtestz_pd:
21031 case Intrinsic::x86_avx_vtestc_pd:
21032 case Intrinsic::x86_avx_vtestnzc_pd:
21033 case Intrinsic::x86_avx_vtestz_ps_256:
21034 case Intrinsic::x86_avx_vtestc_ps_256:
21035 case Intrinsic::x86_avx_vtestnzc_ps_256:
21036 case Intrinsic::x86_avx_vtestz_pd_256:
21037 case Intrinsic::x86_avx_vtestc_pd_256:
21038 case Intrinsic::x86_avx_vtestnzc_pd_256: {
21039 bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
21043 case Intrinsic::x86_avx_vtestz_ps:
21044 case Intrinsic::x86_avx_vtestz_pd:
21045 case Intrinsic::x86_avx_vtestz_ps_256:
21046 case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }
21078 SDValue LHS = Op.getOperand(1);
21079 SDValue RHS = Op.getOperand(2);
21080 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
21081 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
21082 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
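
  // For reference: PTEST (and TESTP for the packed-float variants, which use
  // the sign bits) sets ZF = ((LHS & RHS) == 0) and CF = ((~LHS & RHS) == 0),
  // which is why ptestz maps to COND_E (ZF = 1), ptestc to COND_B (CF = 1),
  // and ptestnzc to COND_A (ZF = 0 and CF = 0).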
21086 case Intrinsic::x86_sse42_pcmpistria128:
21087 case Intrinsic::x86_sse42_pcmpestria128:
21088 case Intrinsic::x86_sse42_pcmpistric128:
21089 case Intrinsic::x86_sse42_pcmpestric128:
21090 case Intrinsic::x86_sse42_pcmpistrio128:
21091 case Intrinsic::x86_sse42_pcmpestrio128:
21092 case Intrinsic::x86_sse42_pcmpistris128:
21093 case Intrinsic::x86_sse42_pcmpestris128:
21094 case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTR;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTR;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTR;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTR;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTR;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTR;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTR;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTR;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTR;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTR;
      X86CC = X86::COND_E;
      break;
    }
21141 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21142 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21143 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
21144 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
21151 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
21152 Opcode = X86ISD::PCMPISTR;
    else
      Opcode = X86ISD::PCMPESTR;
21156 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21157 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }

  case Intrinsic::x86_sse42_pcmpistrm128:
  case Intrinsic::x86_sse42_pcmpestrm128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
21165 Opcode = X86ISD::PCMPISTR;
    else
      Opcode = X86ISD::PCMPESTR;
21169 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21170 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
  }

  case Intrinsic::eh_sjlj_lsda: {
21175 MachineFunction &MF = DAG.getMachineFunction();
21176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21177 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21178 auto &Context = MF.getMMI().getContext();
21179 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21180 Twine(MF.getFunctionNumber()));
21181 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
                       DAG.getMCSymbol(S, PtrVT));
  }

  case Intrinsic::x86_seh_lsda: {
21186 // Compute the symbol for the LSDA. We know it'll get emitted later.
21187 MachineFunction &MF = DAG.getMachineFunction();
21188 SDValue Op1 = Op.getOperand(1);
21189 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21190 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21191 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21193 // Generate a simple absolute symbol reference. This intrinsic is only
21194 // supported on 32-bit Windows, which isn't PIC.
21195 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }

  case Intrinsic::x86_seh_recoverfp: {
21200 SDValue FnOp = Op.getOperand(1);
21201 SDValue IncomingFPOp = Op.getOperand(2);
21202 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
21206 "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }

  case Intrinsic::localaddress: {
21211 // Returns one of the stack, base, or frame pointer registers, depending on
21212 // which is used to reference local variables.
21213 MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
21217 Reg = RegInfo->getBaseRegister();
21218 else // This function handles the SP or FP case.
21219 Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}
21225 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21226 SDValue Src, SDValue Mask, SDValue Base,
21227 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
21234 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21235 EVT MaskVT = Mask.getValueType();
21236 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21237 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21238 SDValue Segment = DAG.getRegister(0, MVT::i32);
21239 // If source is undef or we know it won't be used, use a zero vector
21240 // to break register dependency.
21241 // TODO: use undef instead and let BreakFalseDeps deal with it?
21242 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21243 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21244 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21245 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21246 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
21250 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21251 SDValue Src, SDValue Mask, SDValue Base,
21252 SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
21259 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21260 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21261 Index.getSimpleValueType().getVectorNumElements());
21263 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21264 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21265 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21266 SDValue Segment = DAG.getRegister(0, MVT::i32);
21267 // If source is undef or we know it won't be used, use a zero vector
21268 // to break register dependency.
21269 // TODO: use undef instead and let BreakFalseDeps deal with it?
21270 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21271 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21272 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21273 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21274 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
21278 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21279 SDValue Src, SDValue Mask, SDValue Base,
21280 SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
21287 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21288 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21289 SDValue Segment = DAG.getRegister(0, MVT::i32);
21290 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21291 Index.getSimpleValueType().getVectorNumElements());
21293 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21294 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21295 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21296 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}
21300 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21301 SDValue Mask, SDValue Base, SDValue Index,
21302 SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
21309 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21310 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21314 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21315 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21316 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics that return the value
21321 /// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
21325 SmallVectorImpl<SDValue> &Results) {
21326 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21327 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);
  SDValue LO, HI;

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
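
// Usage note: XGETBV reads the extended control register selected by ECX;
// index 0 is XCR0 (the XFEATURE_ENABLED_MASK commonly queried together with
// CPUID.OSXSAVE to detect AVX state support). The EDX:EAX merge above follows
// the same pattern as the RDPMC and RDTSC lowerings below.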
21365 /// Handles the lowering of builtin intrinsics that read performance monitor
21366 /// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
21370 SmallVectorImpl<SDValue> &Results) {
21371 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
21411 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21412 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21413 /// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
21417 SmallVectorImpl<SDValue> &Results) {
21418 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21419 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21422 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21423 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21424 // and the EAX register is loaded with the low-order 32 bits.
21425 if (Subtarget.is64Bit()) {
21426 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21427 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21430 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21431 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21434 SDValue Chain = HI.getValue(1);
21436 if (Opcode == X86ISD::RDTSCP_DAG) {
21437 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21439 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21440 // the ECX register. Add 'ecx' explicitly to the chain.
21441 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21443 // Explicitly store the content of ECX at the location passed in input
21444 // to the 'rdtscp' intrinsic.
21445 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21446 MachinePointerInfo());
21449 if (Subtarget.is64Bit()) {
21450 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21451 // the EAX register is loaded with the low-order 32 bits.
21452 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21453 DAG.getConstant(32, DL, MVT::i8));
21454 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21455 Results.push_back(Chain);
21459 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21460 SDValue Ops[] = { LO, HI };
21461 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21462 Results.push_back(Pair);
21463 Results.push_back(Chain);
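// For example, if RDTSC leaves EDX:EAX = 0x00000001:0x23456789 (on x86-64
// both halves arrive zero-extended in RDX/RAX), the merge above computes
// (0x00000001 << 32) | 0x23456789 = 0x123456789.
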
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit a truncating store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

/// Emit a masked truncating store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64:
    case Intrinsic::x86_umwait:
    case Intrinsic::x86_tpause: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      unsigned Opcode;

      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_umwait:
        Opcode = X86ISD::UMWAIT;
        break;
      case Intrinsic::x86_tpause:
        Opcode = X86ISD::TPAUSE;
        break;
      case Intrinsic::x86_lwpins32:
      case Intrinsic::x86_lwpins64:
        Opcode = X86ISD::LWPINS;
        break;
      }

      SDValue Operation =
          DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         Operation.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
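  // Sketch of the resulting selection for the i32 forms: the CMOV above
  // yields isValid = CF ? 1 : result, and the hardware leaves result == 0
  // whenever CF is clear, so callers can test the second return value alone.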
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

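// Layout sketch (assuming a frame-pointer chain): each frame stores the
// caller's frame pointer at [FP] and the return address at [FP + SlotSize],
// so a non-zero depth is Depth hops through [FP] plus one load at SlotSize.
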
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op); // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

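// This path serves @llvm.read_register / @llvm.write_register, e.g. clang's
// lowering of a GNU global register variable such as:
//   register unsigned long current_sp asm("rsp");
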
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64-bit, we may need the global base reg
  // after isel expands the pseudos, i.e., after the CGBR pass has run.
  // Therefore, ask for the GlobalBaseReg now so that the pass inserts
  // the code for us in case we need it. Otherwise, we would end up
  // referencing a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF;  // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                     /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                     /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

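// Resulting 64-bit trampoline bytes (sketch; R10/R11 encode as 2/3, so
// REX_WB = 0x49, MOV64ri|r11 = 0xBB, MOV64ri|r10 = 0xBA, ModRM = 0xE3):
//   0:  49 BB <FPtr,8>   movabsq $FPtr, %r11
//   10: 49 BA <Nest,8>   movabsq $Nest, %r10
//   20: 49 FF E3         jmpq   *%r11
// The 32-bit variant is B8|reg <Nest,4> (movl $Nest, %NestReg) followed by
// E9 <disp,4> (jmp rel32), where disp = FPtr - (Trmp + 10).
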
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot.
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot.
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary.
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

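// Worked example of the conversion: FPCW RC bits 11:10 = 01 (round toward
// -inf) gives ((0 >> 11) | (0x400 >> 9)) = 2, then (2 + 1) & 3 = 3, which is
// FLT_ROUNDS' encoding for round-toward-negative-infinity.
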
// Split a unary integer op into 2 half sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  SDValue Src = Op.getOperand(0);
  assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
         "Src and Op should have the same element type!");

  // Extract the Lo/Hi vectors.
  SDLoc dl(Op);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}

// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

/// Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts, and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (NumElems > 16 ||
      (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}

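// For example, an i8 element 0x0a zero-extends to the i32 0x0000000a, whose
// dword ctlz is 28; subtracting the delta 32 - 8 = 24 yields 4, the i8
// leading-zero count.
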
// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // addition).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);

    CurrVT = NextVT;
  }

  return Res;
}

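// LUT walk-through for one byte: for 0x1c the hi nibble 0x1 maps to 3 and is
// non-zero, so the lo result is masked away and ctlz = 3; for 0x03 the hi
// nibble is zero, so the results add: LUT[0] + LUT[3] = 4 + 2 = 6.
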
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI() &&
      // vXi8 vectors need to be promoted to 512-bits for vXi32.
      (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
    return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}

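// The vector identities in action, e.g. x = 0b011000: lsb = x & -x = 0b001000,
// ctpop(lsb - 1) = ctpop(0b000111) = 3 = cttz(x); the undef-on-zero form gives
// (32 - 1) - ctlz(0b001000) = 31 - 28 = 3 as well.
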
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }

  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}

static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // For AVX1 cases, split to use legal ops (everything but v4i64).
  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  SDLoc DL(Op);
  unsigned Opcode = Op.getOpcode();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);

  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
  // using the SMIN/SMAX instructions and flipping the signbit back.
  if (VT == MVT::v8i16) {
    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
           "Unexpected MIN/MAX opcode");
    SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
    Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
    SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
    return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
  }

  // Else, expand to a compare/select.
  ISD::CondCode CC;
  switch (Opcode) {
  case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
  case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
  case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
  case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
  default: llvm_unreachable("Unknown MINMAX opcode");
  }

  SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
  return DAG.getSelect(DL, VT, Cond, N0, N1);
}

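// Signbit-flip example for the v8i16 path: umin(0xffff, 0x0001) becomes
// smin(0x7fff, 0x8001) = 0x8001, and flipping the signbit back yields 0x0001,
// the unsigned minimum.
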
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);

      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
                              4, -1, 5, -1, 6, -1, 7, -1};
    SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
    SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);

    // Extract the hi parts and sign extend to i16.
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
                              12, -1, 13, -1, 14, -1, 15, -1};
    SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
    SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);

    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                                DAG.getBitcast(MVT::v2i64, A),
                                DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                               DAG.getBitcast(MVT::v2i64, Aodds),
                               DAG.getBitcast(MVT::v2i64, Bodds));

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  KnownBits AKnown, BKnown;
  DAG.computeKnownBits(A, AKnown);
  DAG.computeKnownBits(B, BKnown);

  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

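// The decomposition above follows from a = Ahi*2^32 + Alo, b = Bhi*2^32 + Blo:
// modulo 2^64, a*b = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32), with each partial
// product a 32x32->64 PMULUDQ; the Ahi*Bhi term overflows entirely out of the
// low 64 bits and is dropped.
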
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

  // For 512-bit vectors, split into 256-bit vectors to allow the
  // sign-extension to occur.
  if (VT == MVT::v64i8)
    return Lower512IntArith(Op, DAG);

  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

    if (VT == MVT::v32i8) {
      if (Subtarget.canExtendTo512BW()) {
        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
                          DAG.getConstant(8, dl, MVT::v32i16));
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
      }

      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }

    assert(VT == MVT::v16i8 && "Unexpected VT");

    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                      DAG.getConstant(8, dl, MVT::v16i16));
    // If we have BWI we can use the truncate instruction.
    if (Subtarget.hasBWI())
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;
  unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
                                          : ISD::SIGN_EXTEND_VECTOR_INREG;

  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
  if (Subtarget.hasSSE41()) {
    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
  } else {
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }

  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
  } else {
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
  // and pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}

SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}

22893 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22894 SelectionDAG &DAG) {
22895 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22896 MVT VT = Op0.getSimpleValueType();
22899 // Decompose 256-bit ops into smaller 128-bit ops.
22900 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22901 unsigned Opcode = Op.getOpcode();
22902 unsigned NumElems = VT.getVectorNumElements();
22903 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22904 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22905 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22906 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22907 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22908 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22909 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22911 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22912 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22914 return DAG.getMergeValues(Ops, dl);
22917 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22918 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22919 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22921 int NumElts = VT.getVectorNumElements();
22923 // PMULxD operations multiply each even value (starting at 0) of LHS with
22924 // the related value of RHS and produce a widened result.
22925 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22926 // => <2 x i64> <ae|cg>
22928 // In other words, to have all the results, we need to perform two PMULxD:
22929 // 1. one with the even values.
22930 // 2. one with the odd values.
22931 // To achieve #2, we need to place the odd values at an even position.
22933 // Place the odd value at an even position (basically, shift all values 1
22934 // step to the left):
22935 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22936 // <a|b|c|d> => <b|undef|d|undef>
22937 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22938 makeArrayRef(&Mask[0], NumElts));
22939 // <e|f|g|h> => <f|undef|h|undef>
22940 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22941 makeArrayRef(&Mask[0], NumElts));
22943 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22944 // ints.
22945 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22946 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22947 unsigned Opcode =
22948 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22949 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22950 // => <2 x i64> <ae|cg>
22951 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22952 DAG.getBitcast(MulVT, Op0),
22953 DAG.getBitcast(MulVT, Op1)));
22954 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22955 // => <2 x i64> <bf|dh>
22956 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22957 DAG.getBitcast(MulVT, Odd0),
22958 DAG.getBitcast(MulVT, Odd1)));
22960 // Shuffle it back into the right order.
22961 SmallVector<int, 16> HighMask(NumElts);
22962 SmallVector<int, 16> LowMask(NumElts);
22963 for (int i = 0; i != NumElts; ++i) {
22964 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22965 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22966 }
22968 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22969 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
22971 // If we have a signed multiply but no PMULDQ, fix up the high parts of
22972 // an unsigned multiply.
22973 if (IsSigned && !Subtarget.hasSSE41()) {
22974 SDValue ShAmt = DAG.getConstant(
22975 31, dl,
22976 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
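// This implements the standard identity
//   mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0):
// SRA by 31 gives an all-ones mask exactly for negative lanes, so the two
// AND terms below are the corrections subtracted from the unsigned highs.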
22977 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22978 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22979 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22980 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22982 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22983 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22984 }
22986 // The first result of MUL_LOHI is actually the low value, followed by the
22987 // high value.
22988 SDValue Ops[] = {Lows, Highs};
22989 return DAG.getMergeValues(Ops, dl);
22990 }
22992 // Return true if the required (according to Opcode) shift-imm form is natively
22993 // supported by the Subtarget
22994 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22995 unsigned Opcode) {
22996 if (VT.getScalarSizeInBits() < 16)
22997 return false;
22999 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
23000 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
23001 return true;
23003 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
23004 (VT.is256BitVector() && Subtarget.hasInt256());
23006 bool AShift = LShift && (Subtarget.hasAVX512() ||
23007 (VT != MVT::v2i64 && VT != MVT::v4i64));
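// (x86 has no 64-bit-element arithmetic shift before AVX-512's VPSRAQ,
// which is why v2i64/v4i64 are excluded for ISD::SRA here.)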
23008 return (Opcode == ISD::SRA) ? AShift : LShift;
23009 }
23011 // The shift amount is a variable, but it is the same for all vector lanes.
23012 // These instructions are defined together with shift-immediate.
23013 static
23014 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
23015 unsigned Opcode) {
23016 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
23017 }
23019 // Return true if the required (according to Opcode) variable-shift form is
23020 // natively supported by the Subtarget
23021 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
23022 unsigned Opcode) {
23024 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
23025 return false;
23027 // vXi16 supported only on AVX-512, BWI
23028 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
23029 return false;
23031 if (Subtarget.hasAVX512())
23032 return true;
23034 bool LShift = VT.is128BitVector() || VT.is256BitVector();
23035 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
23036 return (Opcode == ISD::SRA) ? AShift : LShift;
23037 }
23039 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
23040 const X86Subtarget &Subtarget) {
23041 MVT VT = Op.getSimpleValueType();
23042 SDLoc dl(Op);
23043 SDValue R = Op.getOperand(0);
23044 SDValue Amt = Op.getOperand(1);
23046 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
23047 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23049 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
23050 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
23051 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
23052 SDValue Ex = DAG.getBitcast(ExVT, R);
23054 // ashr(R, 63) === cmp_slt(R, 0)
23055 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
23056 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
23057 "Unsupported PCMPGT op");
23058 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
23059 getZeroVector(VT, Subtarget, DAG, dl), R);
23062 if (ShiftAmt >= 32) {
23063 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
23064 SDValue Upper =
23065 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
23066 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23067 ShiftAmt - 32, DAG);
23068 if (VT == MVT::v2i64)
23069 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
23070 if (VT == MVT::v4i64)
23071 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23072 {9, 1, 11, 3, 13, 5, 15, 7});
23073 } else {
23074 // SRA upper i32, SRL whole i64 and select lower i32.
23075 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23076 ShiftAmt, DAG);
23077 SDValue Lower =
23078 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
23079 Lower = DAG.getBitcast(ExVT, Lower);
23080 if (VT == MVT::v2i64)
23081 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
23082 if (VT == MVT::v4i64)
23083 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23084 {8, 1, 10, 3, 12, 5, 14, 7});
23085 }
23086 return DAG.getBitcast(VT, Ex);
23087 };
23089 // Optimize shl/srl/sra with constant shift amount.
23090 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23091 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
23092 uint64_t ShiftAmt = ShiftConst->getZExtValue();
23094 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23095 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23097 // i64 SRA needs to be performed as partial shifts.
23098 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
23099 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
23100 Op.getOpcode() == ISD::SRA)
23101 return ArithmeticShiftRight64(ShiftAmt);
23103 if (VT == MVT::v16i8 ||
23104 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
23105 VT == MVT::v64i8) {
23106 unsigned NumElts = VT.getVectorNumElements();
23107 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
23109 // Simple i8 add case
23110 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
23111 return DAG.getNode(ISD::ADD, dl, VT, R, R);
23113 // ashr(R, 7) === cmp_slt(R, 0)
23114 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
23115 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
23116 if (VT.is512BitVector()) {
23117 assert(VT == MVT::v64i8 && "Unexpected element type!");
23118 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
23119 ISD::SETGT);
23120 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
23121 }
23122 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
23123 }
23125 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
23126 if (VT == MVT::v16i8 && Subtarget.hasXOP())
23127 return SDValue();
23129 if (Op.getOpcode() == ISD::SHL) {
23130 // Make a large shift.
23131 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
23132 R, ShiftAmt, DAG);
23133 SHL = DAG.getBitcast(VT, SHL);
23134 // Zero out the rightmost bits.
23135 return DAG.getNode(ISD::AND, dl, VT, SHL,
23136 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
23137 }
23138 if (Op.getOpcode() == ISD::SRL) {
23139 // Make a large shift.
23140 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
23141 R, ShiftAmt, DAG);
23142 SRL = DAG.getBitcast(VT, SRL);
23143 // Zero out the leftmost bits.
23144 return DAG.getNode(ISD::AND, dl, VT, SRL,
23145 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
23146 }
23147 if (Op.getOpcode() == ISD::SRA) {
23148 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
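// After the logical shift the original sign bit lands on the mask bit
// (128 >> ShiftAmt); XOR clears it and the SUB borrows through all the
// higher bits, filling the vacated positions with copies of the sign.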
23149 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23151 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
23152 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
23153 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
23154 return Res;
23155 }
23156 llvm_unreachable("Unknown shift opcode.");
23157 }
23158 }
23159 }
23161 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23162 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
23163 if (!Subtarget.hasXOP() &&
23164 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
23165 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
23167 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
23168 unsigned SubVectorScale = 1;
23169 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23170 SubVectorScale =
23171 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
23172 Amt = Amt.getOperand(0);
23173 }
23175 // Peek through any splat that was introduced for i64 shift vectorization.
23176 int SplatIndex = -1;
23177 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
23178 if (SVN->isSplat()) {
23179 SplatIndex = SVN->getSplatIndex();
23180 Amt = Amt.getOperand(0);
23181 assert(SplatIndex < (int)VT.getVectorNumElements() &&
23182 "Splat shuffle referencing second operand");
23185 if (Amt.getOpcode() != ISD::BITCAST ||
23186 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
23189 Amt = Amt.getOperand(0);
23190 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23191 (SubVectorScale * VT.getVectorNumElements());
23192 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
23193 uint64_t ShiftAmt = 0;
23194 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
23195 for (unsigned i = 0; i != Ratio; ++i) {
23196 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
23197 if (!C)
23198 return SDValue();
23199 // 6 == Log2(64)
23200 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
23201 }
23203 // Check remaining shift amounts (if not a splat).
23204 if (SplatIndex < 0) {
23205 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23206 uint64_t ShAmt = 0;
23207 for (unsigned j = 0; j != Ratio; ++j) {
23208 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
23209 if (!C)
23210 return SDValue();
23211 // 6 == Log2(64)
23212 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
23213 }
23214 if (ShAmt != ShiftAmt)
23215 return SDValue();
23216 }
23217 }
23219 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23220 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23222 if (Op.getOpcode() == ISD::SRA)
23223 return ArithmeticShiftRight64(ShiftAmt);
23224 }
23226 return SDValue();
23227 }
23229 // Determine if V is a splat value, and return the scalar.
23230 static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
23231 SelectionDAG &DAG, const X86Subtarget &Subtarget,
23232 unsigned Opcode) {
23233 V = peekThroughEXTRACT_SUBVECTORs(V);
23235 // Check if this is a splat build_vector node.
23236 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
23237 SDValue SplatAmt = BV->getSplatValue();
23238 if (SplatAmt && SplatAmt.isUndef())
23239 return SDValue();
23240 return SplatAmt;
23241 }
23243 // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
23244 if (V.getOpcode() == ISD::SUB &&
23245 !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
23246 SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
23247 SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
23249 // Ensure that the corresponding splat BV element is not UNDEF.
23250 BitVector UndefElts;
23251 BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
23252 ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
23253 if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
23254 unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
23255 if (!UndefElts[SplatIdx])
23256 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23257 VT.getVectorElementType(), V,
23258 DAG.getIntPtrConstant(SplatIdx, dl));
23259 }
23260 }
23262 // Check if this is a shuffle node doing a splat.
23263 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
23264 if (!SVN || !SVN->isSplat())
23265 return SDValue();
23267 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
23268 SDValue InVec = V.getOperand(0);
23269 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
23270 assert((SplatIdx < VT.getVectorNumElements()) &&
23271 "Unexpected shuffle index found!");
23272 return InVec.getOperand(SplatIdx);
23273 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
23274 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
23275 if (C->getZExtValue() == SplatIdx)
23276 return InVec.getOperand(1);
23277 }
23279 // Avoid introducing an extract element from a shuffle.
23280 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23281 VT.getVectorElementType(), InVec,
23282 DAG.getIntPtrConstant(SplatIdx, dl));
23283 }
23285 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
23286 const X86Subtarget &Subtarget) {
23287 MVT VT = Op.getSimpleValueType();
23288 SDLoc dl(Op);
23289 SDValue R = Op.getOperand(0);
23290 SDValue Amt = Op.getOperand(1);
23291 unsigned Opcode = Op.getOpcode();
23293 unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
23294 (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23296 unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
23297 (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
23299 Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
23301 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
23302 if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
23303 MVT EltVT = VT.getVectorElementType();
23304 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
23305 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
23306 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
23307 else if (EltVT.bitsLT(MVT::i32))
23308 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
23310 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
23311 }
23312 }
23314 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23315 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
23316 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
23317 Amt = Amt.getOperand(0);
23318 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23319 VT.getVectorNumElements();
23320 std::vector<SDValue> Vals(Ratio);
23321 for (unsigned i = 0; i != Ratio; ++i)
23322 Vals[i] = Amt.getOperand(i);
23323 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23324 for (unsigned j = 0; j != Ratio; ++j)
23325 if (Vals[j] != Amt.getOperand(i + j))
23326 return SDValue();
23327 }
23329 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
23330 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
23331 }
23332 return SDValue();
23333 }
23335 // Convert a shift/rotate left amount to a multiplication scale factor.
23336 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
23337 const X86Subtarget &Subtarget,
23338 SelectionDAG &DAG) {
23339 MVT VT = Amt.getSimpleValueType();
23340 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
23341 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
23342 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
23343 return SDValue();
23345 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
23346 SmallVector<SDValue, 8> Elts;
23347 MVT SVT = VT.getVectorElementType();
23348 unsigned SVTBits = SVT.getSizeInBits();
23349 APInt One(SVTBits, 1);
23350 unsigned NumElems = VT.getVectorNumElements();
23352 for (unsigned i = 0; i != NumElems; ++i) {
23353 SDValue Op = Amt->getOperand(i);
23354 if (Op->isUndef()) {
23355 Elts.push_back(Op);
23356 continue;
23357 }
23359 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23360 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23361 uint64_t ShAmt = C.getZExtValue();
23362 if (ShAmt >= SVTBits) {
23363 Elts.push_back(DAG.getUNDEF(SVT));
23364 continue;
23365 }
23366 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23367 }
23368 return DAG.getBuildVector(VT, dl, Elts);
23369 }
23371 // If the target doesn't support variable shifts, use either FP conversion
23372 // or integer multiplication to avoid shifting each element individually.
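// For v4i32 the scale 2^Amt is built directly in FP below: adding
// (Amt << 23) to the bit pattern of 1.0f (0x3f800000) places Amt in the
// exponent field, so the float's value becomes 2^Amt, which FP_TO_SINT
// turns back into an integer power-of-two multiplier.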
23373 if (VT == MVT::v4i32) {
23374 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23375 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
23376 DAG.getConstant(0x3f800000U, dl, VT));
23377 Amt = DAG.getBitcast(MVT::v4f32, Amt);
23378 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
23379 }
23381 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
23382 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
23383 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23384 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
23385 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
23386 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
23387 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
23388 if (Subtarget.hasSSE41())
23389 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23391 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
23392 DAG.getBitcast(VT, Hi),
23393 {0, 2, 4, 6, 8, 10, 12, 14});
23394 }
23396 return SDValue();
23397 }
23399 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
23400 SelectionDAG &DAG) {
23401 MVT VT = Op.getSimpleValueType();
23402 SDLoc dl(Op);
23403 SDValue R = Op.getOperand(0);
23404 SDValue Amt = Op.getOperand(1);
23405 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23407 assert(VT.isVector() && "Custom lowering only for vector shifts!");
23408 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
23410 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
23411 return V;
23413 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
23414 return V;
23416 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
23417 return Op;
23419 // XOP has 128-bit variable logical/arithmetic shifts.
23420 // +ve/-ve Amt = shift left/right.
23421 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
23422 VT == MVT::v8i16 || VT == MVT::v16i8)) {
23423 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
23424 SDValue Zero = DAG.getConstant(0, dl, VT);
23425 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
23426 }
23427 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
23428 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
23429 if (Op.getOpcode() == ISD::SRA)
23430 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
23431 }
23433 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
23434 // shifts per-lane and then shuffle the partial results back together.
23435 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23436 // Splat the shift amounts so the scalar shifts above will catch it.
23437 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23438 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23439 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23440 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23441 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23442 }
23444 // i64 vector arithmetic shift can be emulated with the transform:
23445 // M = lshr(SIGN_MASK, Amt)
23446 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
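// M marks where each lane's sign bit ends up after the logical shift;
// (lshr(R, Amt) ^ M) - M then sign-extends each lane, because the
// subtraction propagates a borrow through the vacated upper bits.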
23447 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23448 Op.getOpcode() == ISD::SRA) {
23449 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23450 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23451 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23452 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23453 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23454 return R;
23455 }
23457 // If possible, lower this shift as a sequence of two shifts by
23458 // constant plus a BLENDing shuffle instead of scalarizing it.
23460 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23462 // Could be rewritten as:
23463 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23465 // The advantage is that the two shifts from the example would be
23466 // lowered as X86ISD::VSRLI nodes in parallel before blending.
23467 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23468 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
23469 SDValue Amt1, Amt2;
23470 unsigned NumElts = VT.getVectorNumElements();
23471 SmallVector<int, 8> ShuffleMask;
23472 for (unsigned i = 0; i != NumElts; ++i) {
23473 SDValue A = Amt->getOperand(i);
23474 if (A.isUndef()) {
23475 ShuffleMask.push_back(SM_SentinelUndef);
23476 continue;
23477 }
23478 if (!Amt1 || Amt1 == A) {
23479 ShuffleMask.push_back(i);
23480 Amt1 = A;
23481 continue;
23482 }
23483 if (!Amt2 || Amt2 == A) {
23484 ShuffleMask.push_back(i + NumElts);
23485 Amt2 = A;
23486 continue;
23487 }
23489 break;
23490 }
23491 // Only perform this blend if we can perform it without loading a mask.
23492 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
23493 isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
23494 (VT != MVT::v16i16 ||
23495 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
23496 (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
23497 Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
23498 SDValue Splat1 =
23499 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23500 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23501 SDValue Splat2 =
23502 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23503 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23504 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
23505 }
23506 }
23508 // If possible, lower this packed shift into a vector multiply instead of
23509 // expanding it into a sequence of scalar shifts.
23510 if (Op.getOpcode() == ISD::SHL)
23511 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
23512 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
23514 // Constant ISD::SRL can be performed efficiently on vXi8/vXi16 vectors: we
23515 // can replace it with ISD::MULHU, creating a scale factor from (NumEltBits - Amt).
23516 // TODO: Improve support for the shift by zero special case.
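// This works because lshr(x, c) == mulhu(x, 2^(NumEltBits - c)) for
// 0 < c < NumEltBits; for c == 0 the scale wraps to zero and MULHU would
// return 0 instead of x, hence the SETEQ select against a zero amount below.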
23517 if (Op.getOpcode() == ISD::SRL && ConstantAmt &&
23518 ((Subtarget.hasSSE41() && VT == MVT::v8i16) ||
23519 DAG.isKnownNeverZero(Amt)) &&
23520 (VT == MVT::v16i8 || VT == MVT::v8i16 ||
23521 ((VT == MVT::v32i8 || VT == MVT::v16i16) && Subtarget.hasInt256()))) {
23522 SDValue EltBits = DAG.getConstant(VT.getScalarSizeInBits(), dl, VT);
23523 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
23524 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
23525 SDValue Zero = DAG.getConstant(0, dl, VT);
23526 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
23527 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
23528 return DAG.getSelect(dl, VT, ZAmt, R, Res);
23529 }
23530 }
23532 // v4i32 non-uniform shifts.
23533 // If the shift amount is constant we can shift each lane using the SSE2
23534 // immediate shifts, else we need to zero-extend each lane to the lower i64
23535 // and shift using the SSE2 variable shifts.
23536 // The separate results can then be blended together.
23537 if (VT == MVT::v4i32) {
23538 unsigned Opc = Op.getOpcode();
23539 SDValue Amt0, Amt1, Amt2, Amt3;
23540 if (ConstantAmt) {
23541 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23542 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23543 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23544 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23545 } else {
23546 // ISD::SHL is handled above but we include it here for completeness.
23547 switch (Opc) {
23548 default:
23549 llvm_unreachable("Unknown target vector shift node");
23550 case ISD::SHL:
23551 Opc = X86ISD::VSHL;
23552 break;
23553 case ISD::SRL:
23554 Opc = X86ISD::VSRL;
23555 break;
23556 case ISD::SRA:
23557 Opc = X86ISD::VSRA;
23558 break;
23559 }
23560 // The SSE2 shifts use the lower i64 as the same shift amount for
23561 // all lanes and the upper i64 is ignored. On AVX we're better off
23562 // just zero-extending, but for SSE just duplicating the top 16-bits is
23563 // cheaper and has the same effect for out of range values.
23564 if (Subtarget.hasAVX()) {
23565 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23566 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23567 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23568 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23569 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23570 } else {
23571 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
23572 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23573 {4, 5, 6, 7, -1, -1, -1, -1});
23574 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23575 {0, 1, 1, 1, -1, -1, -1, -1});
23576 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23577 {2, 3, 3, 3, -1, -1, -1, -1});
23578 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23579 {0, 1, 1, 1, -1, -1, -1, -1});
23580 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23581 {2, 3, 3, 3, -1, -1, -1, -1});
23582 }
23583 }
23585 SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
23586 SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
23587 SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
23588 SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
23590 // Merge the shifted lane results optimally with/without PBLENDW.
23591 // TODO - ideally shuffle combining would handle this.
23592 if (Subtarget.hasSSE41()) {
23593 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23594 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23595 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23596 }
23597 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
23598 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
23599 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
23600 }
23602 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23603 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23604 // make the existing SSE solution better.
23605 // NOTE: We honor the preferred vector width before promoting to 512 bits.
23606 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23607 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23608 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23609 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23610 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23611 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23612 "Unexpected vector type");
23613 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23614 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23615 unsigned ExtOpc =
23616 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23617 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23618 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23619 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23620 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23621 }
23623 if (VT == MVT::v16i8 ||
23624 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23625 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23626 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23627 unsigned ShiftOpcode = Op->getOpcode();
23629 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23630 if (VT.is512BitVector()) {
23631 // On AVX512BW targets we make use of the fact that VSELECT lowers
23632 // to a masked blend which selects bytes based just on the sign bit
23633 // extracted to a mask.
23634 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23635 V0 = DAG.getBitcast(VT, V0);
23636 V1 = DAG.getBitcast(VT, V1);
23637 Sel = DAG.getBitcast(VT, Sel);
23638 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
23639 ISD::SETGT);
23640 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23641 } else if (Subtarget.hasSSE41()) {
23642 // On SSE41 targets we make use of the fact that VSELECT lowers
23643 // to PBLENDVB which selects bytes based just on the sign bit.
23644 V0 = DAG.getBitcast(VT, V0);
23645 V1 = DAG.getBitcast(VT, V1);
23646 Sel = DAG.getBitcast(VT, Sel);
23647 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23648 }
23649 // On pre-SSE41 targets we test for the sign bit by comparing to
23650 // zero - a negative value will set all bits of the lanes to true
23651 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23652 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23653 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23654 return DAG.getSelect(dl, SelVT, C, V0, V1);
23655 };
23657 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23658 // We can safely do this using i16 shifts as we're only interested in
23659 // the 3 lower bits of each byte.
23660 Amt = DAG.getBitcast(ExtVT, Amt);
23661 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23662 Amt = DAG.getBitcast(VT, Amt);
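// With the amount pre-shifted left by 5, bit 2 of each byte's shift amount
// now sits in the sign bit. Each SignBitSelect stage conditionally applies
// a shift of 4, then 2, then 1, and the amount is doubled between stages
// so the next lower bit moves into the sign position.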
23664 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23665 // r = VSELECT(r, shift(r, 4), a);
23666 SDValue M =
23667 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23668 R = SignBitSelect(VT, Amt, M, R);
23670 // a += a
23671 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23673 // r = VSELECT(r, shift(r, 2), a);
23674 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23675 R = SignBitSelect(VT, Amt, M, R);
23678 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23680 // return VSELECT(r, shift(r, 1), a);
23681 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23682 R = SignBitSelect(VT, Amt, M, R);
23683 return R;
23684 }
23686 if (Op->getOpcode() == ISD::SRA) {
23687 // For SRA we need to unpack each byte to the higher byte of an i16 vector
23688 // so we can correctly sign extend. We don't care what happens to the
23689 // lower byte.
23690 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23691 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23692 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23693 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23694 ALo = DAG.getBitcast(ExtVT, ALo);
23695 AHi = DAG.getBitcast(ExtVT, AHi);
23696 RLo = DAG.getBitcast(ExtVT, RLo);
23697 RHi = DAG.getBitcast(ExtVT, RHi);
23699 // r = VSELECT(r, shift(r, 4), a);
23700 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23701 DAG.getConstant(4, dl, ExtVT));
23702 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23703 DAG.getConstant(4, dl, ExtVT));
23704 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23705 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23708 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23709 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23711 // r = VSELECT(r, shift(r, 2), a);
23712 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23713 DAG.getConstant(2, dl, ExtVT));
23714 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23715 DAG.getConstant(2, dl, ExtVT));
23716 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23717 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23720 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23721 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23723 // r = VSELECT(r, shift(r, 1), a);
23724 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23725 DAG.getConstant(1, dl, ExtVT));
23726 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23727 DAG.getConstant(1, dl, ExtVT));
23728 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23729 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23731 // Logical shift the result back to the lower byte, leaving a zero upper
23732 // byte,
23733 // meaning that we can safely pack with PACKUSWB.
23734 RLo =
23735 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23736 RHi =
23737 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23738 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23739 }
23740 }
23742 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23743 MVT ExtVT = MVT::v8i32;
23744 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23745 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23746 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23747 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23748 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23749 ALo = DAG.getBitcast(ExtVT, ALo);
23750 AHi = DAG.getBitcast(ExtVT, AHi);
23751 RLo = DAG.getBitcast(ExtVT, RLo);
23752 RHi = DAG.getBitcast(ExtVT, RHi);
23753 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23754 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23755 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23756 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23757 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23758 }
23760 if (VT == MVT::v8i16) {
23761 unsigned ShiftOpcode = Op->getOpcode();
23763 // If we have a constant shift amount, the non-SSE41 path is best as
23764 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23765 bool UseSSE41 = Subtarget.hasSSE41() &&
23766 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23768 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23769 // On SSE41 targets we make use of the fact that VSELECT lowers
23770 // to PBLENDVB which selects bytes based just on the sign bit.
23771 if (UseSSE41) {
23772 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23773 V0 = DAG.getBitcast(ExtVT, V0);
23774 V1 = DAG.getBitcast(ExtVT, V1);
23775 Sel = DAG.getBitcast(ExtVT, Sel);
23776 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23777 }
23778 // On pre-SSE41 targets we splat the sign bit - a negative value will
23779 // set all bits of the lanes to true and VSELECT uses that in
23780 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23781 SDValue C =
23782 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23783 return DAG.getSelect(dl, VT, C, V0, V1);
23784 };
23786 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23787 if (UseSSE41) {
23788 // On SSE41 targets we need to replicate the shift mask in both
23789 // bytes for PBLENDVB.
23790 Amt = DAG.getNode(
23791 ISD::OR, dl, VT,
23792 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23793 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23794 } else {
23795 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23796 }
23798 // r = VSELECT(r, shift(r, 8), a);
23799 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23800 R = SignBitSelect(Amt, M, R);
23803 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23805 // r = VSELECT(r, shift(r, 4), a);
23806 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23807 R = SignBitSelect(Amt, M, R);
23810 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23812 // r = VSELECT(r, shift(r, 2), a);
23813 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23814 R = SignBitSelect(Amt, M, R);
23817 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23819 // return VSELECT(r, shift(r, 1), a);
23820 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23821 R = SignBitSelect(Amt, M, R);
23822 return R;
23823 }
23825 // Decompose 256-bit shifts into smaller 128-bit shifts.
23826 if (VT.is256BitVector())
23827 return Lower256IntArith(Op, DAG);
23829 return SDValue();
23830 }
23832 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23833 SelectionDAG &DAG) {
23834 MVT VT = Op.getSimpleValueType();
23835 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23837 SDLoc DL(Op);
23838 SDValue R = Op.getOperand(0);
23839 SDValue Amt = Op.getOperand(1);
23840 unsigned Opcode = Op.getOpcode();
23841 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23843 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
23844 // Attempt to rotate by immediate.
23845 APInt UndefElts;
23846 SmallVector<APInt, 16> EltBits;
23847 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23848 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23849 return EltBits[0] == V;
23850 })) {
23851 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23852 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23853 return DAG.getNode(Op, DL, VT, R,
23854 DAG.getConstant(RotateAmt, DL, MVT::i8));
23855 }
23856 }
23858 // Else, fall back on VPROLV/VPRORV.
23859 return Op;
23860 }
23862 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23864 // XOP has 128-bit vector variable + immediate rotates.
23865 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23866 if (Subtarget.hasXOP()) {
23867 // Split 256-bit integers.
23868 if (VT.is256BitVector())
23869 return Lower256IntArith(Op, DAG);
23870 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23872 // Attempt to rotate by immediate.
23873 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23874 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23875 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23876 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23877 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23878 DAG.getConstant(RotateAmt, DL, MVT::i8));
23879 }
23880 }
23882 // Use general rotate by variable (per-element).
23883 return Op;
23884 }
23886 // Split 256-bit integers on pre-AVX2 targets.
23887 if (VT.is256BitVector() && !Subtarget.hasAVX2())
23888 return Lower256IntArith(Op, DAG);
23890 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
23891 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
23892 Subtarget.hasAVX2())) &&
23893 "Only vXi32/vXi16/vXi8 vector rotates supported");
23895 // Rotate by a uniform constant - expand back to shifts.
23896 // TODO - legalizers should be able to handle this.
23897 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23898 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23899 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23900 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23901 if (RotateAmt == 0)
23902 return R;
23904 SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
23905 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23906 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23907 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23908 }
23909 }
23911 // Rotate by splat - expand back to shifts.
23912 // TODO - legalizers should be able to handle this.
23913 if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
23914 IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
23915 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23916 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23917 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23918 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23919 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23920 }
23922 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
23923 // the amount bit.
23924 if (EltSizeInBits == 8) {
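// As with the vXi8 shifts above, each stage ORs SHL by 2^k with SRL by
// (8 - 2^k) - i.e. a rotate by 2^k - and the sign-bit selects compose the
// chosen stages into the requested per-byte rotate amount.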
23925 if (Subtarget.hasBWI()) {
23926 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23927 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23928 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23929 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23930 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23931 }
23933 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23935 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23936 if (Subtarget.hasSSE41()) {
23937 // On SSE41 targets we make use of the fact that VSELECT lowers
23938 // to PBLENDVB which selects bytes based just on the sign bit.
23939 V0 = DAG.getBitcast(VT, V0);
23940 V1 = DAG.getBitcast(VT, V1);
23941 Sel = DAG.getBitcast(VT, Sel);
23942 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
23944 // On pre-SSE41 targets we test for the sign bit by comparing to
23945 // zero - a negative value will set all bits of the lanes to true
23946 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23947 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
23948 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
23949 return DAG.getSelect(DL, SelVT, C, V0, V1);
23950 };
23952 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23953 // We can safely do this using i16 shifts as we're only interested in
23954 // the 3 lower bits of each byte.
23955 Amt = DAG.getBitcast(ExtVT, Amt);
23956 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
23957 Amt = DAG.getBitcast(VT, Amt);
23959 // r = VSELECT(r, rot(r, 4), a);
23960 SDValue M;
23961 M = DAG.getNode(
23962 ISD::OR, DL, VT,
23963 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
23964 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
23965 R = SignBitSelect(VT, Amt, M, R);
23968 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23970 // r = VSELECT(r, rot(r, 2), a);
23971 M = DAG.getNode(
23972 ISD::OR, DL, VT,
23973 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
23974 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
23975 R = SignBitSelect(VT, Amt, M, R);
23978 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23980 // return VSELECT(r, rot(r, 1), a);
23981 M = DAG.getNode(
23982 ISD::OR, DL, VT,
23983 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
23984 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
23985 return SignBitSelect(VT, Amt, M, R);
23986 }
23988 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23989 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
23990 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
23992 // Best to fall back for all supported variable shifts.
23993 // AVX2 - best to fall back for non-constants as well.
23994 // TODO - legalizers should be able to handle this.
23995 if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
23996 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23997 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23998 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23999 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
24000 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
24001 }
24003 // As with shifts, convert the rotation amount to a multiplication factor.
24004 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
24005 assert(Scale && "Failed to convert ROTL amount to scale");
24007 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
24008 if (EltSizeInBits == 16) {
24009 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
24010 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
24011 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24012 }
24014 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
24015 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
24016 // that can then be OR'd with the lower 32-bits.
24017 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
24018 static const int OddMask[] = {1, -1, 3, -1};
24019 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
24020 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
24022 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
24023 DAG.getBitcast(MVT::v2i64, R),
24024 DAG.getBitcast(MVT::v2i64, Scale));
24025 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
24026 DAG.getBitcast(MVT::v2i64, R13),
24027 DAG.getBitcast(MVT::v2i64, Scale13));
24028 Res02 = DAG.getBitcast(VT, Res02);
24029 Res13 = DAG.getBitcast(VT, Res13);
24031 return DAG.getNode(ISD::OR, DL, VT,
24032 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
24033 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
24034 }
24036 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24037 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24038 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24039 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24040 // has only one use.
24041 SDNode *N = Op.getNode();
24042 SDValue LHS = N->getOperand(0);
24043 SDValue RHS = N->getOperand(1);
24044 unsigned BaseOp = 0;
24045 X86::CondCode Cond;
24046 SDLoc DL(Op);
24047 switch (Op.getOpcode()) {
24048 default: llvm_unreachable("Unknown ovf instruction!");
24049 case ISD::SADDO:
24050 // An add of one will be selected as an INC. Note that INC doesn't
24051 // set CF, so we can't do this for UADDO.
24052 if (isOneConstant(RHS)) {
24053 BaseOp = X86ISD::INC;
24054 Cond = X86::COND_O;
24055 break;
24056 }
24057 BaseOp = X86ISD::ADD;
24058 Cond = X86::COND_O;
24059 break;
24060 case ISD::UADDO:
24061 BaseOp = X86ISD::ADD;
24062 Cond = X86::COND_B;
24063 break;
24064 case ISD::SSUBO:
24065 // A subtract of one will be selected as a DEC. Note that DEC doesn't
24066 // set CF, so we can't do this for USUBO.
24067 if (isOneConstant(RHS)) {
24068 BaseOp = X86ISD::DEC;
24069 Cond = X86::COND_O;
24070 break;
24071 }
24072 BaseOp = X86ISD::SUB;
24073 Cond = X86::COND_O;
24074 break;
24075 case ISD::USUBO:
24076 BaseOp = X86ISD::SUB;
24077 Cond = X86::COND_B;
24078 break;
24079 case ISD::SMULO:
24080 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
24081 Cond = X86::COND_O;
24082 break;
24083 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
24084 if (N->getValueType(0) == MVT::i8) {
24085 BaseOp = X86ISD::UMUL8;
24086 Cond = X86::COND_O;
24087 break;
24088 }
24089 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
24090 MVT::i32);
24091 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
24093 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
24095 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24096 }
24097 }
24099 // Also sets EFLAGS.
24100 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
24101 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24103 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
24105 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24106 }
24108 /// Returns true if the operand type is exactly twice the native width, and
24109 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
24110 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
24111 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
24112 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
24113 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
24115 if (OpWidth == 64)
24116 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
24117 else if (OpWidth == 128)
24118 return Subtarget.hasCmpxchg16b();
24120 return false;
24121 }
24123 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
24124 return needsCmpXchgNb(SI->getValueOperand()->getType());
24125 }
24127 // Note: this turns large loads into lock cmpxchg8b/16b.
24128 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
24129 TargetLowering::AtomicExpansionKind
24130 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
24131 auto PTy = cast<PointerType>(LI->getPointerOperandType());
24132 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
24133 : AtomicExpansionKind::None;
24134 }
24136 TargetLowering::AtomicExpansionKind
24137 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
24138 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24139 Type *MemType = AI->getType();
24141 // If the operand is too big, we must see if cmpxchg8/16b is available
24142 // and default to library calls otherwise.
24143 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
24144 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
24145 : AtomicExpansionKind::None;
24146 }
24148 AtomicRMWInst::BinOp Op = AI->getOperation();
24149 switch (Op) {
24150 default:
24151 llvm_unreachable("Unknown atomic operation");
24152 case AtomicRMWInst::Xchg:
24153 case AtomicRMWInst::Add:
24154 case AtomicRMWInst::Sub:
24155 // It's better to use xadd, xsub or xchg for these in all cases.
24156 return AtomicExpansionKind::None;
24157 case AtomicRMWInst::Or:
24158 case AtomicRMWInst::And:
24159 case AtomicRMWInst::Xor:
24160 // If the atomicrmw's result isn't actually used, we can just add a "lock"
24161 // prefix to a normal instruction for these operations.
24162 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
24163 : AtomicExpansionKind::None;
24164 case AtomicRMWInst::Nand:
24165 case AtomicRMWInst::Max:
24166 case AtomicRMWInst::Min:
24167 case AtomicRMWInst::UMax:
24168 case AtomicRMWInst::UMin:
24169 // These always require a non-trivial set of data operations on x86. We must
24170 // use a cmpxchg loop.
24171 return AtomicExpansionKind::CmpXChg;
24172 }
24173 }
24175 LoadInst *
24176 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
24177 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24178 Type *MemType = AI->getType();
24179 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
24180 // there is no benefit in turning such RMWs into loads, and it is actually
24181 // harmful as it introduces a mfence.
24182 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
24183 return nullptr;
24185 auto Builder = IRBuilder<>(AI);
24186 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
24187 auto SSID = AI->getSyncScopeID();
24188 // We must restrict the ordering to avoid generating loads with Release or
24189 // ReleaseAcquire orderings.
24190 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
24191 auto Ptr = AI->getPointerOperand();
24193 // Before the load we need a fence. Here is an example lifted from
24194 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
24195 // is required:
24196 // Thread 0:
24197 // x.store(1, relaxed);
24198 // r1 = y.fetch_add(0, release);
24199 // Thread 1:
24200 // y.fetch_add(42, acquire);
24201 // r2 = x.load(relaxed);
24202 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
24203 // lowered to just a load without a fence. A mfence flushes the store buffer,
24204 // making the optimization clearly correct.
24205 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
24206 // otherwise, we might be able to be more aggressive on relaxed idempotent
24207 // rmw. In practice, they do not look useful, so we don't try to be
24208 // especially clever.
24209 if (SSID == SyncScope::SingleThread)
24210 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
24211 // the IR level, so we must wrap it in an intrinsic.
24212 return nullptr;
24214 if (!Subtarget.hasMFence())
24215 // FIXME: it might make sense to use a locked operation here but on a
24216 // different cache-line to prevent cache-line bouncing. In practice it
24217 // is probably a small win, and x86 processors without mfence are rare
24218 // enough that we do not bother.
24219 return nullptr;
24221 Function *MFence =
24222 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
24223 Builder.CreateCall(MFence, {});
24225 // Finally we can emit the atomic load.
24226 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
24227 AI->getType()->getPrimitiveSizeInBits());
24228 Loaded->setAtomic(Order, SSID);
24229 AI->replaceAllUsesWith(Loaded);
24230 AI->eraseFromParent();
24232 return Loaded;
24233 }
24234 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
24235 SelectionDAG &DAG) {
24236 SDLoc dl(Op);
24237 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
24238 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
24239 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
24240 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
24242 // The only fence that needs an instruction is a sequentially-consistent
24243 // cross-thread fence.
24244 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
24245 FenceSSID == SyncScope::System) {
24246 if (Subtarget.hasMFence())
24247 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
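// Without MFENCE, fall back on the classic `lock or dword ptr [esp], 0`
// idiom built below: any LOCK-prefixed read-modify-write acts as a full
// memory barrier, and OR'ing zero into the top of the stack changes nothing.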
24249 SDValue Chain = Op.getOperand(0);
24250 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
24251 SDValue Ops[] = {
24252 DAG.getRegister(X86::ESP, MVT::i32), // Base
24253 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
24254 DAG.getRegister(0, MVT::i32), // Index
24255 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
24256 DAG.getRegister(0, MVT::i32), // Segment.
24257 Zero,
24258 Chain
24259 };
24260 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
24261 return SDValue(Res, 0);
24262 }
24264 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
24265 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
24266 }
24268 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
24269 SelectionDAG &DAG) {
24270 MVT T = Op.getSimpleValueType();
24271 SDLoc DL(Op);
24272 unsigned Reg = 0;
24273 unsigned size = 0;
24274 switch(T.SimpleTy) {
24275 default: llvm_unreachable("Invalid value type!");
24276 case MVT::i8: Reg = X86::AL; size = 1; break;
24277 case MVT::i16: Reg = X86::AX; size = 2; break;
24278 case MVT::i32: Reg = X86::EAX; size = 4; break;
24279 case MVT::i64:
24280 assert(Subtarget.is64Bit() && "Node not type legal!");
24281 Reg = X86::RAX; size = 8;
24282 break;
24283 }
24284 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
24285 Op.getOperand(2), SDValue());
24286 SDValue Ops[] = { cpIn.getValue(0),
24287 Op.getOperand(1),
24288 Op.getOperand(3),
24289 DAG.getTargetConstant(size, DL, MVT::i8),
24290 cpIn.getValue(1) };
24291 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24292 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
24293 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
24294 Ops, T, MMO);
24296 SDValue cpOut =
24297 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
24298 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
24299 MVT::i32, cpOut.getValue(2));
24300 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
24302 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
24303 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
24304 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
24305 return SDValue();
24306 }
24308 // Create MOVMSKB, taking into account whether we need to split for AVX1.
24309 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
24310 const X86Subtarget &Subtarget) {
24311 MVT InVT = V.getSimpleValueType();
24313 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
24314 SDValue Lo, Hi;
24315 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
24316 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
24317 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
24318 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
24319 DAG.getConstant(16, DL, MVT::i8));
24320 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
24321 }
24323 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24324 }
24326 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
24327 SelectionDAG &DAG) {
24328 SDValue Src = Op.getOperand(0);
24329 MVT SrcVT = Src.getSimpleValueType();
24330 MVT DstVT = Op.getSimpleValueType();
24332 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
24333 // half to v32i1 and concatenating the result.
24334 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
24335 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
24336 assert(Subtarget.hasBWI() && "Expected BWI target");
24337 SDLoc dl(Op);
24338 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24339 DAG.getIntPtrConstant(0, dl));
24340 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24341 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24342 DAG.getIntPtrConstant(1, dl));
24343 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24344 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24345 }
24347 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
24348 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
24349 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
24350 SDLoc dl(Op);
24351 SDValue Lo, Hi;
24352 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
24353 EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
24354 DstVT.getVectorNumElements() / 2);
24355 Lo = DAG.getBitcast(CastVT, Lo);
24356 Hi = DAG.getBitcast(CastVT, Hi);
24357 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
24358 }
24360 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
24361 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
24362 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
24363 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
24364 SDLoc DL(Op);
24365 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
24366 V = getPMOVMSKB(DL, V, DAG, Subtarget);
24367 return DAG.getZExtOrTrunc(V, DL, DstVT);
24368 }
24370 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
24371 SrcVT == MVT::i64) {
24372 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24373 if (DstVT != MVT::f64)
24374 // This conversion needs to be expanded.
24375 return SDValue();
24377 SmallVector<SDValue, 16> Elts;
24378 SDLoc dl(Op);
24379 unsigned NumElts;
24380 MVT SVT;
24381 if (SrcVT.isVector()) {
24382 NumElts = SrcVT.getVectorNumElements();
24383 SVT = SrcVT.getVectorElementType();
24385 // Widen the input vector in the case of MVT::v2i32.
24386 // Example: from MVT::v2i32 to MVT::v4i32.
24387 for (unsigned i = 0, e = NumElts; i != e; ++i)
24388 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
24389 DAG.getIntPtrConstant(i, dl)));
24390 } else {
24391 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
24392 "Unexpected source type in LowerBITCAST");
24393 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24394 DAG.getIntPtrConstant(0, dl)));
24395 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24396 DAG.getIntPtrConstant(1, dl)));
24397 NumElts = 2;
24398 SVT = MVT::i32;
24399 }
24400 // Explicitly mark the extra elements as Undef.
24401 Elts.append(NumElts, DAG.getUNDEF(SVT));
24403 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24404 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
24405 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
24406 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
24407 DAG.getIntPtrConstant(0, dl));
24408 }
24410 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
24411 Subtarget.hasMMX() && "Unexpected custom BITCAST");
24412 assert((DstVT == MVT::i64 ||
24413 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
24414 "Unexpected custom BITCAST");
24415 // i64 <=> MMX conversions are Legal.
24416 if (SrcVT==MVT::i64 && DstVT.isVector())
24417 return Op;
24418 if (DstVT==MVT::i64 && SrcVT.isVector())
24419 return Op;
24420 // MMX <=> MMX conversions are Legal.
24421 if (SrcVT.isVector() && DstVT.isVector())
24422 return Op;
24423 // All other conversions need to be expanded.
24424 return SDValue();
24425 }
24427 /// Compute the horizontal sum of bytes in V for the elements of VT.
24429 /// Requires V to be a byte vector and VT to be an integer vector type with
24430 /// wider elements than V's type. The width of the elements of VT determines
24431 /// how many bytes of V are summed horizontally to produce each element of the
24432 /// result.
24433 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
24434 const X86Subtarget &Subtarget,
24435 SelectionDAG &DAG) {
24436 SDLoc DL(V);
24437 MVT ByteVecVT = V.getSimpleValueType();
24438 MVT EltVT = VT.getVectorElementType();
24439 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
24440 "Expected value to have byte element type.");
24441 assert(EltVT != MVT::i8 &&
24442 "Horizontal byte sum only makes sense for wider elements!");
24443 unsigned VecSize = VT.getSizeInBits();
24444 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
24446 // The PSADBW instruction horizontally adds all bytes and leaves the result
24447 // in i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
24448 if (EltVT == MVT::i64) {
24449 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24450 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24451 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
24452 return DAG.getBitcast(VT, V);
24453 }
24455 if (EltVT == MVT::i32) {
24456 // We unpack the low half and high half into i32s interleaved with zeros so
24457 // that we can use PSADBW to horizontally sum them. The most useful part of
24458 // this is that it lines up the results of two PSADBW instructions to be
24459 // two v2i64 vectors which concatenated are the 4 population counts. We can
24460 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
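    // Sketch of the data flow for one 128-bit lane (names illustrative only),
    // where V32 = {a, b, c, d} is the byte-count vector viewed as v4i32:
    //   Low  = unpckl(V32, 0) = {a, 0, b, 0}  ; each i64 chunk holds one element
    //   High = unpckh(V32, 0) = {c, 0, d, 0}
    //   psadbw(Low, 0)  -> v2i64 {sum_bytes(a), sum_bytes(b)}
    //   psadbw(High, 0) -> v2i64 {sum_bytes(c), sum_bytes(d)}
    //   packus(Low, High) on i16 lanes, reread as v4i32 -> the four counts.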
24461 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
24462 SDValue V32 = DAG.getBitcast(VT, V);
24463 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
24464 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
24466 // Do the horizontal sums into two v2i64s.
24467 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24468 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24469 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24470 DAG.getBitcast(ByteVecVT, Low), Zeros);
24471 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24472 DAG.getBitcast(ByteVecVT, High), Zeros);
24474 // Merge them together.
24475 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
24476 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
24477 DAG.getBitcast(ShortVecVT, Low),
24478 DAG.getBitcast(ShortVecVT, High));
24480 return DAG.getBitcast(VT, V);
24481 }
24483 // The only element type left is i16.
24484 assert(EltVT == MVT::i16 && "Unknown how to handle type");
24486 // To obtain pop count for each i16 element starting from the pop count for
24487 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
24488 // right by 8. It is important to shift as i16s because an i8 vector shift
24489 // isn't directly supported.
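// Worked example for one i16 lane (byte counts chosen for illustration):
// with the lane holding [hi|lo] = [2|3],
//   shl 8 -> [3|0]; i8 add with [2|3] -> [5|3]; srl 8 (as i16) -> [0|5],
// leaving the full 16-bit pop count hi + lo = 5 in the lane.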
24490 SDValue ShifterV = DAG.getConstant(8, DL, VT);
24491 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24492 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
24493 DAG.getBitcast(ByteVecVT, V));
24494 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24495 }
24497 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
24498 const X86Subtarget &Subtarget,
24499 SelectionDAG &DAG) {
24500 MVT VT = Op.getSimpleValueType();
24501 MVT EltVT = VT.getVectorElementType();
24502 unsigned VecSize = VT.getSizeInBits();
24504 // Implement a lookup table in register by using an algorithm based on:
24505 // http://wm.ite.pl/articles/sse-popcount.html
24507 // The general idea is that every lower byte nibble in the input vector is an
24508 // index into an in-register pre-computed pop count table. We then split up
24509 // the input vector into two new ones: (1) a vector with only the
24510 // shifted-right higher nibbles for each byte and (2) a vector with the lower
24511 // nibbles (and masked-out higher ones) for each byte. PSHUFB is used
24512 // separately with both to index the in-register table. Next, both are added
24513 // and the result is an i8 vector where each element contains the pop count
24513 // for its input byte.
24515 // To obtain the pop count for elements != i8, we follow up with the same
24516 // approach and use additional tricks as described below.
24518 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
24519 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
24520 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
24521 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
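// Worked example (byte value chosen for illustration): for an input byte
// 0xA7 = 0b10100111, the high nibble 0xA indexes LUT[0xa] = 2 and the low
// nibble 0x7 indexes LUT[0x7] = 3, and the two PSHUFB results add up to
// popcount(0xA7) = 5.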
24523 int NumByteElts = VecSize / 8;
24524 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
24525 SDValue In = DAG.getBitcast(ByteVecVT, Op);
24526 SmallVector<SDValue, 64> LUTVec;
24527 for (int i = 0; i < NumByteElts; ++i)
24528 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24529 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
24530 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
24533 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
24534 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
24537 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
24539 // The input vector is used as the shuffle mask that index elements into the
24540 // LUT. After counting low and high nibbles, add the vector to obtain the
24541 // final pop count per i8 element.
24542 SDValue HighPopCnt =
24543 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24544 SDValue LowPopCnt =
24545 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24546 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
24548 if (EltVT == MVT::i8)
24549 return PopCnt;
24551 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
24552 }
24554 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
24555 const X86Subtarget &Subtarget,
24556 SelectionDAG &DAG) {
24557 MVT VT = Op.getSimpleValueType();
24558 assert(VT.is128BitVector() &&
24559 "Only 128-bit vector bitmath lowering supported.");
24561 int VecSize = VT.getSizeInBits();
24562 MVT EltVT = VT.getVectorElementType();
24563 int Len = EltVT.getSizeInBits();
24565 // This is the vectorized version of the "best" algorithm from
24566 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
24567 // with a minor tweak to use a series of adds + shifts instead of vector
24568 // multiplications. Implemented for all integer vector types. We only use
24569 // this when we don't have SSSE3 which allows a LUT-based lowering that is
24570 // much faster, even faster than using native popcnt instructions.
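// Worked example on a single byte (value chosen for illustration), v = 0xFF:
//   v = v - ((v >> 1) & 0x55)          -> 0xAA (each 2-bit field holds 2)
//   v = (v & 0x33) + ((v >> 2) & 0x33) -> 0x44 (each nibble holds 4)
//   v = (v + (v >> 4)) & 0x0F          -> 0x08 == popcount(0xFF)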
24572 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
24573 MVT VT = V.getSimpleValueType();
24574 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
24575 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
24576 };
24577 auto GetMask = [&](SDValue V, APInt Mask) {
24578 MVT VT = V.getSimpleValueType();
24579 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
24580 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
24581 };
24583 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
24584 // x86, so set the SRL type to have elements at least i16 wide. This is
24585 // correct because all of our SRLs are followed immediately by a mask anyways
24586 // that handles any bits that sneak into the high bits of the byte elements.
24587 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
24589 SDValue V = Op;
24591 // v = v - ((v >> 1) & 0x55555555...)
24592 SDValue Srl =
24593 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
24594 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
24595 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
24597 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
24598 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
24599 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
24600 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
24601 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
24603 // v = (v + (v >> 4)) & 0x0F0F0F0F...
24604 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
24605 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
24606 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
24608 // At this point, V contains the byte-wise population count, and we are
24609 // merely doing a horizontal sum if necessary to get the wider element
24610 // type.
24611 if (EltVT == MVT::i8)
24612 return V;
24614 return LowerHorizontalByteSum(
24615 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
24616 DAG);
24617 }
24619 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
24620 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
24621 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24622 SelectionDAG &DAG) {
24623 MVT VT = Op.getSimpleValueType();
24624 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24625 "Unknown CTPOP type to handle");
24626 SDLoc DL(Op.getNode());
24627 SDValue Op0 = Op.getOperand(0);
24629 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24630 if (Subtarget.hasVPOPCNTDQ()) {
24631 unsigned NumElems = VT.getVectorNumElements();
24632 assert((VT.getVectorElementType() == MVT::i8 ||
24633 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24634 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
24635 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24636 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24637 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24638 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24639 }
24640 }
24642 if (!Subtarget.hasSSSE3()) {
24643 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24644 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24645 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24646 }
24648 // Decompose 256-bit ops into smaller 128-bit ops.
24649 if (VT.is256BitVector() && !Subtarget.hasInt256())
24650 return Lower256IntUnary(Op, DAG);
24652 // Decompose 512-bit ops into smaller 256-bit ops.
24653 if (VT.is512BitVector() && !Subtarget.hasBWI())
24654 return Lower512IntUnary(Op, DAG);
24656 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
24657 }
24659 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24660 SelectionDAG &DAG) {
24661 assert(Op.getSimpleValueType().isVector() &&
24662 "We only do custom lowering for vector population count.");
24663 return LowerVectorCTPOP(Op, Subtarget, DAG);
24664 }
24666 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24667 MVT VT = Op.getSimpleValueType();
24668 SDValue In = Op.getOperand(0);
24670 SDLoc DL(Op);
24671 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
24672 // perform the BITREVERSE.
24673 if (!VT.isVector()) {
24674 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24675 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24676 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24677 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24678 DAG.getIntPtrConstant(0, DL));
24679 }
24681 int NumElts = VT.getVectorNumElements();
24682 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24684 // Decompose 256-bit ops into smaller 128-bit ops.
24685 if (VT.is256BitVector())
24686 return Lower256IntUnary(Op, DAG);
24688 assert(VT.is128BitVector() &&
24689 "Only 128-bit vector bitreverse lowering supported.");
24691 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24692 // perform the BSWAP in the shuffle.
24693 // It's best to shuffle using the second operand, as this will implicitly
24694 // allow memory folding for multiple vectors.
24695 SmallVector<SDValue, 16> MaskElts;
24696 for (int i = 0; i != NumElts; ++i) {
24697 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24698 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24699 int PermuteByte = SourceByte | (2 << 5);
24700 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24701 }
24702 }
24704 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24705 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24706 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24707 Res, Mask);
24708 return DAG.getBitcast(VT, Res);
24709 }
24711 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24712 SelectionDAG &DAG) {
24713 MVT VT = Op.getSimpleValueType();
24715 if (Subtarget.hasXOP() && !VT.is512BitVector())
24716 return LowerBITREVERSE_XOP(Op, DAG);
24718 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24720 SDValue In = Op.getOperand(0);
24721 SDLoc DL(Op);
24723 unsigned NumElts = VT.getVectorNumElements();
24724 assert(VT.getScalarType() == MVT::i8 &&
24725 "Only byte vector BITREVERSE supported");
24727 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24728 if (VT.is256BitVector() && !Subtarget.hasInt256())
24729 return Lower256IntUnary(Op, DAG);
24731 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24732 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24733 // 0-15 value (moved to the other nibble).
24734 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24735 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24736 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24738 const int LoLUT[16] = {
24739 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24740 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24741 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24742 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24743 const int HiLUT[16] = {
24744 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24745 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24746 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24747 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
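// Worked example (byte value chosen for illustration): reversing
// 0xB1 = 0b10110001. The low nibble 0x1 gives LoLUT[0x1] = 0x80, the high
// nibble 0xB gives HiLUT[0xb] = 0x0D, and OR-ing the two PSHUFB results
// yields 0x8D = 0b10001101, the bit-reversed byte.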
24749 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24750 for (unsigned i = 0; i < NumElts; ++i) {
24751 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24752 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24753 }
24755 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24756 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24757 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24758 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24759 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24760 }
24762 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24763 const X86Subtarget &Subtarget,
24764 bool AllowIncDec = true) {
24765 unsigned NewOpc = 0;
24766 switch (N->getOpcode()) {
24767 case ISD::ATOMIC_LOAD_ADD:
24768 NewOpc = X86ISD::LADD;
24769 break;
24770 case ISD::ATOMIC_LOAD_SUB:
24771 NewOpc = X86ISD::LSUB;
24772 break;
24773 case ISD::ATOMIC_LOAD_OR:
24774 NewOpc = X86ISD::LOR;
24775 break;
24776 case ISD::ATOMIC_LOAD_XOR:
24777 NewOpc = X86ISD::LXOR;
24778 break;
24779 case ISD::ATOMIC_LOAD_AND:
24780 NewOpc = X86ISD::LAND;
24781 break;
24782 default:
24783 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24784 }
24786 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24788 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24789 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24790 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24791 DAG.getMachineFunction().getFunction().optForSize())) {
24792 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24793 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24794 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24795 DAG.getVTList(MVT::i32, MVT::Other),
24796 {N->getOperand(0), N->getOperand(1)},
24797 /*MemVT=*/N->getSimpleValueType(0), MMO);
24798 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24799 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24800 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24801 DAG.getVTList(MVT::i32, MVT::Other),
24802 {N->getOperand(0), N->getOperand(1)},
24803 /*MemVT=*/N->getSimpleValueType(0), MMO);
24804 }
24805 }
24807 return DAG.getMemIntrinsicNode(
24808 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24809 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24810 /*MemVT=*/N->getSimpleValueType(0), MMO);
24811 }
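// For illustration (the exact assembly depends on instruction selection):
// an i32 atomic add of the constant 1 whose result is unused can come out as
//   lock incl (%rdi)
// rather than
//   lock addl $1, (%rdi)
// and an add of -1 maps to LOCK DEC the same way, saving the immediate byte
// on targets where INC/DEC are not slow.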
24813 /// Lower atomic_load_ops into LOCK-prefixed operations.
24814 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24815 const X86Subtarget &Subtarget) {
24816 SDValue Chain = N->getOperand(0);
24817 SDValue LHS = N->getOperand(1);
24818 SDValue RHS = N->getOperand(2);
24819 unsigned Opc = N->getOpcode();
24820 MVT VT = N->getSimpleValueType(0);
24821 SDLoc DL(N);
24823 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24824 // can only be lowered when the result is unused. They should have already
24825 // been transformed into a cmpxchg loop in AtomicExpand.
24826 if (N->hasAnyUseOfValue(0)) {
24827 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24828 // select LXADD if LOCK_SUB can't be selected.
24829 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24830 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24831 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24832 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24833 RHS, AN->getMemOperand());
24834 }
24835 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24836 "Used AtomicRMW ops other than Add should have been expanded!");
24837 return N;
24838 }
24840 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24841 // RAUW the chain, but don't worry about the result, as it's unused.
24842 assert(!N->hasAnyUseOfValue(0));
24843 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24844 return LockOp;
24845 }
24847 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24848 SDNode *Node = Op.getNode();
24849 SDLoc dl(Node);
24850 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24852 // Convert seq_cst store -> xchg
24853 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24854 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24855 // (The only way to get a 16-byte store is cmpxchg16b)
24856 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24857 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24858 AtomicOrdering::SequentiallyConsistent ||
24859 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24860 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24861 cast<AtomicSDNode>(Node)->getMemoryVT(),
24862 Node->getOperand(0),
24863 Node->getOperand(1), Node->getOperand(2),
24864 cast<AtomicSDNode>(Node)->getMemOperand());
24865 return Swap.getValue(1);
24866 }
24867 // Other atomic stores have a simple pattern.
24868 return Op;
24869 }
24871 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24872 SDNode *N = Op.getNode();
24873 MVT VT = N->getSimpleValueType(0);
24875 // Let legalize expand this if it isn't a legal type yet.
24876 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24877 return SDValue();
24879 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24880 SDLoc DL(N);
24882 // Set the carry flag.
24883 SDValue Carry = Op.getOperand(2);
24884 EVT CarryVT = Carry.getValueType();
24885 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24886 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24887 Carry, DAG.getConstant(NegOne, DL, CarryVT));
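// Note on the ADD above: adding all-ones (-1) to the incoming carry value
// sets the hardware carry flag exactly when that value is nonzero, so the
// boolean carry operand is rematerialized into EFLAGS.CF for the ADC/SBB
// emitted below.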
24889 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24890 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24891 Op.getOperand(1), Carry.getValue(1));
24893 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24894 if (N->getValueType(1) == MVT::i1)
24895 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24897 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24898 }
24900 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24901 SelectionDAG &DAG) {
24902 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24904 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24905 // which returns the values as { float, float } (in XMM0) or
24906 // { double, double } (which is returned in XMM0, XMM1).
24907 SDLoc dl(Op);
24908 SDValue Arg = Op.getOperand(0);
24909 EVT ArgVT = Arg.getValueType();
24910 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24912 TargetLowering::ArgListTy Args;
24913 TargetLowering::ArgListEntry Entry;
24915 Entry.Node = Arg;
24916 Entry.Ty = ArgTy;
24917 Entry.IsSExt = false;
24918 Entry.IsZExt = false;
24919 Args.push_back(Entry);
24921 bool isF64 = ArgVT == MVT::f64;
24922 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24923 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24924 // the results are returned via SRet in memory.
24925 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24926 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24927 const char *LibcallName = TLI.getLibcallName(LC);
24928 SDValue Callee =
24929 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24931 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24932 : (Type *)VectorType::get(ArgTy, 4);
24934 TargetLowering::CallLoweringInfo CLI(DAG);
24935 CLI.setDebugLoc(dl)
24936 .setChain(DAG.getEntryNode())
24937 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24939 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24941 if (isF64)
24942 // Returned in xmm0 and xmm1.
24943 return CallResult.first;
24945 // Returned in bits 0:31 and 32:64 xmm0.
24946 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24947 CallResult.first, DAG.getIntPtrConstant(0, dl));
24948 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24949 CallResult.first, DAG.getIntPtrConstant(1, dl));
24950 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24951 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24952 }
24954 /// Widen a vector input to a vector of NVT. The
24955 /// input vector must have the same element type as NVT.
24956 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24957 bool FillWithZeroes = false) {
24958 // Check if InOp already has the right width.
24959 MVT InVT = InOp.getSimpleValueType();
24960 if (InVT == NVT)
24961 return InOp;
24963 if (InOp.isUndef())
24964 return DAG.getUNDEF(NVT);
24966 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24967 "input and widen element type must match");
24969 unsigned InNumElts = InVT.getVectorNumElements();
24970 unsigned WidenNumElts = NVT.getVectorNumElements();
24971 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24972 "Unexpected request for vector widening");
24974 SDLoc dl(InOp);
24975 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24976 InOp.getNumOperands() == 2) {
24977 SDValue N1 = InOp.getOperand(1);
24978 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24979 N1.isUndef()) {
24980 InOp = InOp.getOperand(0);
24981 InVT = InOp.getSimpleValueType();
24982 InNumElts = InVT.getVectorNumElements();
24983 }
24984 }
24985 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24986 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24987 SmallVector<SDValue, 16> Ops;
24988 for (unsigned i = 0; i < InNumElts; ++i)
24989 Ops.push_back(InOp.getOperand(i));
24991 EVT EltVT = InOp.getOperand(0).getValueType();
24993 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24994 DAG.getUNDEF(EltVT);
24995 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24996 Ops.push_back(FillVal);
24997 return DAG.getBuildVector(NVT, dl, Ops);
24998 }
24999 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
25000 DAG.getUNDEF(NVT);
25001 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
25002 InOp, DAG.getIntPtrConstant(0, dl));
25003 }
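// Usage sketch (types chosen for illustration): widening a v2i1 mask to the
// v8i1 a 512-bit operation expects, zero-filling the new lanes so that they
// stay inactive:
//   Mask = ExtendToType(Mask, MVT::v8i1, DAG, /*FillWithZeroes=*/true);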
25005 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
25006 SelectionDAG &DAG) {
25007 assert(Subtarget.hasAVX512() &&
25008 "MGATHER/MSCATTER are supported on AVX-512 arch only");
25010 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
25011 SDValue Src = N->getValue();
25012 MVT VT = Src.getSimpleValueType();
25013 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
25015 SDLoc dl(Op);
25016 SDValue Scale = N->getScale();
25017 SDValue Index = N->getIndex();
25018 SDValue Mask = N->getMask();
25019 SDValue Chain = N->getChain();
25020 SDValue BasePtr = N->getBasePtr();
25022 if (VT == MVT::v2f32) {
25023 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25024 // If the index is v2i64 and we have VLX we can use xmm for data and index.
25025 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25026 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25027 DAG.getUNDEF(MVT::v2f32));
25028 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25029 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25030 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25031 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25032 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25033 return SDValue(NewScatter.getNode(), 1);
25034 }
25035 return SDValue();
25036 }
25038 if (VT == MVT::v2i32) {
25039 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25040 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
25041 DAG.getUNDEF(MVT::v2i32));
25042 // If the index is v2i64 and we have VLX we can use xmm for data and index.
25043 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25044 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25045 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25046 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25047 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25048 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25049 return SDValue(NewScatter.getNode(), 1);
25050 }
25051 // Custom widen all the operands to avoid promotion.
25052 EVT NewIndexVT = EVT::getVectorVT(
25053 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
25054 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25055 DAG.getUNDEF(Index.getValueType()));
25056 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25057 DAG.getConstant(0, dl, MVT::v2i1));
25058 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25059 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
25060 Ops, N->getMemOperand());
25061 }
25063 MVT IndexVT = Index.getSimpleValueType();
25064 MVT MaskVT = Mask.getSimpleValueType();
25066 // If the index is v2i32, we're being called by type legalization and we
25067 // should just let the default handling take care of it.
25068 if (IndexVT == MVT::v2i32)
25069 return SDValue();
25071 // If we don't have VLX and neither the passthru nor the index is 512 bits,
25072 // we need to widen until one is.
25073 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
25074 !Index.getSimpleValueType().is512BitVector()) {
25075 // Determine how much we need to widen by to get a 512-bit type.
25076 unsigned Factor = std::min(512/VT.getSizeInBits(),
25077 512/IndexVT.getSizeInBits());
25078 unsigned NumElts = VT.getVectorNumElements() * Factor;
25080 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25081 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25082 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25084 Src = ExtendToType(Src, VT, DAG);
25085 Index = ExtendToType(Index, IndexVT, DAG);
25086 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25087 }
25089 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
25090 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25091 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25092 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25093 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25094 return SDValue(NewScatter.getNode(), 1);
25095 }
25097 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
25098 SelectionDAG &DAG) {
25100 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
25101 MVT VT = Op.getSimpleValueType();
25102 MVT ScalarVT = VT.getScalarType();
25103 SDValue Mask = N->getMask();
25104 SDLoc dl(Op);
25106 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
25107 "Expanding masked load is supported on AVX-512 target only!");
25109 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
25110 "Expanding masked load is supported for 32 and 64-bit types only!");
25112 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25113 "Cannot lower masked load op.");
25115 assert((ScalarVT.getSizeInBits() >= 32 ||
25116 (Subtarget.hasBWI() &&
25117 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25118 "Unsupported masked load op.");
25120 // This operation is legal for targets with VLX, but without
25121 // VLX the vector should be widened to 512 bits.
25122 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
25123 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25124 SDValue Src0 = N->getSrc0();
25125 Src0 = ExtendToType(Src0, WideDataVT, DAG);
25127 // Mask element has to be i1.
25128 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25129 "Unexpected mask type");
25131 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25133 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25134 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
25135 N->getBasePtr(), Mask, Src0,
25136 N->getMemoryVT(), N->getMemOperand(),
25137 N->getExtensionType(),
25138 N->isExpandingLoad());
25140 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
25141 NewLoad.getValue(0),
25142 DAG.getIntPtrConstant(0, dl));
25143 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
25144 return DAG.getMergeValues(RetOps, dl);
25145 }
25147 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
25148 SelectionDAG &DAG) {
25149 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
25150 SDValue DataToStore = N->getValue();
25151 MVT VT = DataToStore.getSimpleValueType();
25152 MVT ScalarVT = VT.getScalarType();
25153 SDValue Mask = N->getMask();
25154 SDLoc dl(Op);
25156 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
25157 "Compressing masked store is supported on AVX-512 target only!");
25159 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
25160 "Compressing masked store is supported for 32 and 64-bit types only!");
25162 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25163 "Cannot lower masked store op.");
25165 assert((ScalarVT.getSizeInBits() >= 32 ||
25166 (Subtarget.hasBWI() &&
25167 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25168 "Unsupported masked store op.");
25170 // This operation is legal for targets with VLX, but without
25171 // VLX the vector should be widened to 512 bits.
25172 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
25173 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25175 // Mask element has to be i1.
25176 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25177 "Unexpected mask type");
25179 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25181 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
25182 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25183 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
25184 Mask, N->getMemoryVT(), N->getMemOperand(),
25185 N->isTruncatingStore(), N->isCompressingStore());
25186 }
25188 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
25189 SelectionDAG &DAG) {
25190 assert(Subtarget.hasAVX2() &&
25191 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
25193 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
25195 MVT VT = Op.getSimpleValueType();
25196 SDValue Index = N->getIndex();
25197 SDValue Mask = N->getMask();
25198 SDValue Src0 = N->getValue();
25199 MVT IndexVT = Index.getSimpleValueType();
25200 MVT MaskVT = Mask.getSimpleValueType();
25201 SDLoc dl(Op);
25202 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
25204 // If the index is v2i32, we're being called by type legalization.
25205 if (IndexVT == MVT::v2i32)
25206 return SDValue();
25208 // If we don't have VLX and neither the passthru nor the index is 512 bits,
25209 // we need to widen until one is.
25210 MVT OrigVT = VT;
25211 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25212 !IndexVT.is512BitVector()) {
25213 // Determine how much we need to widen by to get a 512-bit type.
25214 unsigned Factor = std::min(512/VT.getSizeInBits(),
25215 512/IndexVT.getSizeInBits());
25217 unsigned NumElts = VT.getVectorNumElements() * Factor;
25219 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25220 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25221 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25223 Src0 = ExtendToType(Src0, VT, DAG);
25224 Index = ExtendToType(Index, IndexVT, DAG);
25225 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25226 }
25228 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
25229 N->getScale() };
25230 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25231 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
25232 N->getMemOperand());
25233 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
25234 NewGather, DAG.getIntPtrConstant(0, dl));
25235 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
25236 }
25238 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
25239 SelectionDAG &DAG) const {
25240 // TODO: Eventually, the lowering of these nodes should be informed by or
25241 // deferred to the GC strategy for the function in which they appear. For
25242 // now, however, they must be lowered to something. Since they are logically
25243 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25244 // require special handling for these nodes), lower them as literal NOOPs for
25245 // the time being.
25246 SmallVector<SDValue, 2> Ops;
25248 Ops.push_back(Op.getOperand(0));
25249 if (Op->getGluedNode())
25250 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25253 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25254 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25256 return NOOP;
25257 }
25259 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
25260 SelectionDAG &DAG) const {
25261 // TODO: Eventually, the lowering of these nodes should be informed by or
25262 // deferred to the GC strategy for the function in which they appear. For
25263 // now, however, they must be lowered to something. Since they are logically
25264 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25265 // require special handling for these nodes), lower them as literal NOOPs for
25266 // the time being.
25267 SmallVector<SDValue, 2> Ops;
25269 Ops.push_back(Op.getOperand(0));
25270 if (Op->getGluedNode())
25271 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25274 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25275 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25277 return NOOP;
25278 }
25280 /// Provide custom lowering hooks for some operations.
25281 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
25282 switch (Op.getOpcode()) {
25283 default: llvm_unreachable("Should not custom lower this!");
25284 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
25285 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
25286 return LowerCMP_SWAP(Op, Subtarget, DAG);
25287 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
25288 case ISD::ATOMIC_LOAD_ADD:
25289 case ISD::ATOMIC_LOAD_SUB:
25290 case ISD::ATOMIC_LOAD_OR:
25291 case ISD::ATOMIC_LOAD_XOR:
25292 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
25293 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
25294 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
25295 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
25296 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
25297 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
25298 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
25299 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
25300 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
25301 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
25302 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
25303 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
25304 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
25305 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
25306 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
25307 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
25308 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
25309 case ISD::SHL_PARTS:
25310 case ISD::SRA_PARTS:
25311 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
25312 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
25313 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
25314 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
25315 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
25316 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
25317 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
25318 case ISD::ZERO_EXTEND_VECTOR_INREG:
25319 case ISD::SIGN_EXTEND_VECTOR_INREG:
25320 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
25321 case ISD::FP_TO_SINT:
25322 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
25323 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
25324 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
25325 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
25326 case ISD::FABS:
25327 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
25328 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
25329 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
25330 case ISD::SETCC: return LowerSETCC(Op, DAG);
25331 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
25332 case ISD::SELECT: return LowerSELECT(Op, DAG);
25333 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
25334 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
25335 case ISD::VASTART: return LowerVASTART(Op, DAG);
25336 case ISD::VAARG: return LowerVAARG(Op, DAG);
25337 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
25338 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
25339 case ISD::INTRINSIC_VOID:
25340 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
25341 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
25342 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
25343 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
25344 case ISD::FRAME_TO_ARGS_OFFSET:
25345 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
25346 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
25347 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
25348 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
25349 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
25350 case ISD::EH_SJLJ_SETUP_DISPATCH:
25351 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
25352 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
25353 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
25354 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
25355 case ISD::CTLZ:
25356 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
25357 case ISD::CTTZ:
25358 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
25359 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
25360 case ISD::MULHS:
25361 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
25362 case ISD::UMUL_LOHI:
25363 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
25364 case ISD::ROTL:
25365 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
25366 case ISD::SRA:
25367 case ISD::SRL:
25368 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
25369 case ISD::SADDO:
25370 case ISD::UADDO:
25371 case ISD::SSUBO:
25372 case ISD::USUBO:
25373 case ISD::SMULO:
25374 case ISD::UMULO: return LowerXALUO(Op, DAG);
25375 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
25376 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
25377 case ISD::ADDCARRY:
25378 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
25379 case ISD::ADD:
25380 case ISD::SUB: return LowerADD_SUB(Op, DAG);
25381 case ISD::SMAX:
25382 case ISD::SMIN:
25383 case ISD::UMAX:
25384 case ISD::UMIN: return LowerMINMAX(Op, DAG);
25385 case ISD::ABS: return LowerABS(Op, DAG);
25386 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
25387 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
25388 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
25389 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
25390 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
25391 case ISD::GC_TRANSITION_START:
25392 return LowerGC_TRANSITION_START(Op, DAG);
25393 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
25394 }
25395 }
25397 /// Places new result values for the node in Results (their number
25398 /// and types must exactly match those of the original return values of
25399 /// the node), or leaves Results empty, which indicates that the node is not
25400 /// to be custom lowered after all.
25401 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
25402 SmallVectorImpl<SDValue> &Results,
25403 SelectionDAG &DAG) const {
25404 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
25406 if (!Res.getNode())
25407 return;
25409 assert((N->getNumValues() <= Res->getNumValues()) &&
25410 "Lowering returned the wrong number of results!");
25412 // Place new result values based on N's result number.
25413 // In some cases (LowerSINT_TO_FP for example) Res has more result values
25414 // than the original node; the chain (the last value) should be dropped.
25415 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
25416 Results.push_back(Res.getValue(I));
25417 }
25419 /// Replace a node with an illegal result type with a new node built out of
25420 /// custom code.
25421 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
25422 SmallVectorImpl<SDValue>&Results,
25423 SelectionDAG &DAG) const {
25424 SDLoc dl(N);
25425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25426 switch (N->getOpcode()) {
25428 llvm_unreachable("Do not know how to custom type legalize this operation!");
25429 case X86ISD::AVG: {
25430 // Legalize types for X86ISD::AVG by expanding vectors.
25431 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25433 auto InVT = N->getValueType(0);
25434 assert(InVT.getSizeInBits() < 128);
25435 assert(128 % InVT.getSizeInBits() == 0);
25436 unsigned NumConcat = 128 / InVT.getSizeInBits();
25438 EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
25439 InVT.getVectorElementType(),
25440 NumConcat * InVT.getVectorNumElements());
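// For illustration: an illegal v2i8 AVG (16 bits wide) gives
// NumConcat = 128 / 16 = 8, so each operand is concatenated with undef
// operands up to the legal v16i8 before emitting one wide X86ISD::AVG.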
25442 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
25443 Ops[0] = N->getOperand(0);
25444 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25445 Ops[0] = N->getOperand(1);
25446 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25448 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
25449 if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
25450 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
25451 DAG.getIntPtrConstant(0, dl));
25452 Results.push_back(Res);
25453 return;
25454 }
25455 case ISD::SETCC: {
25456 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
25457 // setCC result type is v2i1 because type legalization will end up with
25458 // a v4i1 setcc plus an extend.
25459 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
25460 if (N->getOperand(0).getValueType() != MVT::v2f32)
25461 return;
25462 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
25463 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25464 N->getOperand(0), UNDEF);
25465 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25466 N->getOperand(1), UNDEF);
25467 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
25468 N->getOperand(2));
25469 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25470 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25471 DAG.getIntPtrConstant(0, dl));
25472 Results.push_back(Res);
25473 return;
25474 }
25475 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
25476 case X86ISD::FMINC:
25477 case X86ISD::FMIN:
25478 case X86ISD::FMAXC:
25479 case X86ISD::FMAX: {
25480 EVT VT = N->getValueType(0);
25481 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
25482 SDValue UNDEF = DAG.getUNDEF(VT);
25483 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25484 N->getOperand(0), UNDEF);
25485 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25486 N->getOperand(1), UNDEF);
25487 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
25488 return;
25489 }
25490 case ISD::SDIV:
25491 case ISD::UDIV:
25492 case ISD::SREM:
25493 case ISD::UREM:
25494 case ISD::SDIVREM:
25495 case ISD::UDIVREM: {
25496 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
25497 Results.push_back(V);
25498 return;
25499 }
25500 case ISD::FP_TO_SINT:
25501 case ISD::FP_TO_UINT: {
25502 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
25503 EVT VT = N->getValueType(0);
25504 SDValue Src = N->getOperand(0);
25505 EVT SrcVT = Src.getValueType();
25507 if (VT == MVT::v2i32) {
25508 assert((IsSigned || Subtarget.hasAVX512()) &&
25509 "Can only handle signed conversion without AVX512");
25510 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25511 if (Src.getValueType() == MVT::v2f64) {
25512 MVT ResVT = MVT::v4i32;
25513 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
25514 if (!IsSigned && !Subtarget.hasVLX()) {
25515 // Widen to 512-bits.
25516 ResVT = MVT::v8i32;
25517 Opc = ISD::FP_TO_UINT;
25518 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
25519 DAG.getUNDEF(MVT::v8f64),
25520 Src, DAG.getIntPtrConstant(0, dl));
25521 }
25522 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
25523 bool WidenType = getTypeAction(*DAG.getContext(),
25524 MVT::v2i32) == TypeWidenVector;
25525 ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
25526 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
25527 DAG.getIntPtrConstant(0, dl));
25528 Results.push_back(Res);
25529 return;
25530 }
25531 if (SrcVT == MVT::v2f32) {
25532 SDValue Idx = DAG.getIntPtrConstant(0, dl);
25533 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25534 DAG.getUNDEF(MVT::v2f32));
25535 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
25536 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
25537 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25538 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
25539 Results.push_back(Res);
25540 return;
25541 }
25543 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
25544 // so early out here.
25545 return;
25546 }
25548 if (Subtarget.hasDQI() && VT == MVT::i64 &&
25549 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
25550 assert(!Subtarget.is64Bit() && "i64 should be legal");
25551 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
25552 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
25553 // TODO: Use 128-bit vectors for f64 case?
25554 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
25555 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
25556 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
25558 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
25559 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
25560 DAG.getConstantFP(0.0, dl, VecInVT), Src,
25561 ZeroIdx);
25562 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
25563 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
25564 Results.push_back(Res);
25565 return;
25566 }
25568 std::pair<SDValue,SDValue> Vals =
25569 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
25570 SDValue FIST = Vals.first, StackSlot = Vals.second;
25571 if (FIST.getNode()) {
25572 // Return a load from the stack slot.
25573 if (StackSlot.getNode())
25574 Results.push_back(
25575 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
25576 else
25577 Results.push_back(FIST);
25578 }
25579 return;
25580 }
25581 case ISD::SINT_TO_FP: {
25582 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
25583 SDValue Src = N->getOperand(0);
25584 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
25585 return;
25586 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
25587 return;
25588 }
25589 case ISD::UINT_TO_FP: {
25590 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25591 EVT VT = N->getValueType(0);
25592 if (VT != MVT::v2f32)
25593 return;
25594 SDValue Src = N->getOperand(0);
25595 EVT SrcVT = Src.getValueType();
25596 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
25597 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
25598 return;
25599 }
25600 if (SrcVT != MVT::v2i32)
25601 return;
25602 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
25603 SDValue VBias =
25604 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
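// 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing a
// zero-extended 32-bit value into the mantissa of 2^52 produces exactly the
// double 2^52 + x, so the FSUB below recovers (double)x with no rounding;
// this is the classic bias trick for unsigned int-to-fp conversion.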
25605 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
25606 DAG.getBitcast(MVT::v2i64, VBias));
25607 Or = DAG.getBitcast(MVT::v2f64, Or);
25608 // TODO: Are there any fast-math-flags to propagate here?
25609 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
25610 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
25611 return;
25612 }
25613 case ISD::FP_ROUND: {
25614 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
25615 return;
25616 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
25617 Results.push_back(V);
25618 return;
25619 }
25620 case ISD::FP_EXTEND: {
25621 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25622 // No other ValueType for FP_EXTEND should reach this point.
25623 assert(N->getValueType(0) == MVT::v2f32 &&
25624 "Do not know how to legalize this Node");
25625 return;
25626 }
25627 case ISD::INTRINSIC_W_CHAIN: {
25628 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25629 switch (IntNo) {
25630 default : llvm_unreachable("Do not know how to custom type "
25631 "legalize this intrinsic operation!");
25632 case Intrinsic::x86_rdtsc:
25633 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25634 Results);
25635 case Intrinsic::x86_rdtscp:
25636 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25637 Results);
25638 case Intrinsic::x86_rdpmc:
25639 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25641 case Intrinsic::x86_xgetbv:
25642 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25643 }
25644 }
25645 case ISD::INTRINSIC_WO_CHAIN: {
25646 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25647 Results.push_back(V);
25648 return;
25649 }
25650 case ISD::READCYCLECOUNTER: {
25651 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25652 Results);
25653 }
25654 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25655 EVT T = N->getValueType(0);
25656 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25657 bool Regs64bit = T == MVT::i128;
25658 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
25659 SDValue cpInL, cpInH;
25660 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25661 DAG.getConstant(0, dl, HalfT));
25662 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25663 DAG.getConstant(1, dl, HalfT));
25664 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25665 Regs64bit ? X86::RAX : X86::EAX,
25666 cpInL);
25667 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25668 Regs64bit ? X86::RDX : X86::EDX,
25669 cpInH, cpInL.getValue(1));
25670 SDValue swapInL, swapInH;
25671 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25672 DAG.getConstant(0, dl, HalfT));
25673 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25674 DAG.getConstant(1, dl, HalfT));
25675 swapInH =
25676 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25677 swapInH, cpInH.getValue(1));
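// Register protocol used here: CMPXCHG8B/CMPXCHG16B compare the expected
// value in EDX:EAX (RDX:RAX for the 16-byte form) against memory and, on a
// match, store the replacement value from ECX:EBX (RCX:RBX); ZF reports
// success, and the current memory value comes back in EDX:EAX (RDX:RAX).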
25678 // If the current function needs the base pointer, RBX,
25679 // we shouldn't use cmpxchg directly.
25680 // Indeed the lowering of that instruction will clobber
25681 // that register and since RBX will be a reserved register
25682 // the register allocator will not make sure its value will
25683 // be properly saved and restored around this live-range.
25684 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25685 SDValue Result;
25686 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25687 unsigned BasePtr = TRI->getBaseRegister();
25688 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25689 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25690 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25691 // ISel prefers the LCMPXCHG64 variant.
25692 // If that assert breaks, that means it is not the case anymore,
25693 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25694 // not just EBX. This is a matter of accepting i64 input for that
25695 // pseudo, and restoring into the register of the right width
25696 // in the expand pseudo. Everything else should just work.
25697 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25698 "Saving only half of the RBX");
25699 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25700 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25701 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25702 Regs64bit ? X86::RBX : X86::EBX,
25703 HalfT, swapInH.getValue(1));
25704 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25705 swapInH,
25706 /*Glue*/ RBXSave.getValue(2)};
25707 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25708 } else {
25709 unsigned Opcode =
25710 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25711 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25712 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25713 swapInH.getValue(1));
25714 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25715 swapInL.getValue(1)};
25716 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25717 }
25718 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25719 Regs64bit ? X86::RAX : X86::EAX,
25720 HalfT, Result.getValue(1));
25721 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25722 Regs64bit ? X86::RDX : X86::EDX,
25723 HalfT, cpOutL.getValue(2));
25724 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25726 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25727 MVT::i32, cpOutH.getValue(2));
25728 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25729 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25731 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25732 Results.push_back(Success);
25733 Results.push_back(EFLAGS.getValue(1));
25734 return;
25735 }
25736 case ISD::ATOMIC_SWAP:
25737 case ISD::ATOMIC_LOAD_ADD:
25738 case ISD::ATOMIC_LOAD_SUB:
25739 case ISD::ATOMIC_LOAD_AND:
25740 case ISD::ATOMIC_LOAD_OR:
25741 case ISD::ATOMIC_LOAD_XOR:
25742 case ISD::ATOMIC_LOAD_NAND:
25743 case ISD::ATOMIC_LOAD_MIN:
25744 case ISD::ATOMIC_LOAD_MAX:
25745 case ISD::ATOMIC_LOAD_UMIN:
25746 case ISD::ATOMIC_LOAD_UMAX:
25747 case ISD::ATOMIC_LOAD: {
25748 // Delegate to generic TypeLegalization. Situations we can really handle
25749 // should have already been dealt with by AtomicExpandPass.cpp.
25750 break;
25751 }
25752 case ISD::BITCAST: {
25753 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25754 EVT DstVT = N->getValueType(0);
25755 EVT SrcVT = N->getOperand(0).getValueType();
25757 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
25758 // we can split using the k-register rather than memory.
25759 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25760 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25761 SDValue Lo, Hi;
25762 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25763 Lo = DAG.getBitcast(MVT::i32, Lo);
25764 Hi = DAG.getBitcast(MVT::i32, Hi);
25765 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25766 Results.push_back(Res);
25767 return;
25768 }
25770 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25771 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
25772 SrcVT.isVector() && isTypeLegal(SrcVT)) {
25773 SDValue Lo, Hi;
25774 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25775 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
25776 Lo = DAG.getBitcast(CastVT, Lo);
25777 Hi = DAG.getBitcast(CastVT, Hi);
25778 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
25779 Results.push_back(Res);
25780 return;
25781 }
25783 if (SrcVT != MVT::f64 ||
25784 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25785 return;
25787 unsigned NumElts = DstVT.getVectorNumElements();
25788 EVT SVT = DstVT.getVectorElementType();
25789 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25790 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25791 MVT::v2f64, N->getOperand(0));
25792 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
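// The scalar f64 now sits in lane 0 of a legal vector twice as wide as the
// requested type; only the low NumElts lanes carry meaningful data.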
25794 if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
25795 // If we are legalizing vectors by widening, we already have the desired
25796 // legal vector type, just return it.
25797 Results.push_back(ToVecInt);
25798 return;
25799 }
25801 SmallVector<SDValue, 8> Elts;
25802 for (unsigned i = 0, e = NumElts; i != e; ++i)
25803 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25804 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25806 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25807 return;
25808 }
25809 case ISD::MGATHER: {
25810 EVT VT = N->getValueType(0);
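// There are no 2-element X86 gather instructions, so v2f32/v2i32 gathers
// are widened to 4 elements here, with the extra lanes masked off or undef.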
25811 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25812 auto *Gather = cast<MaskedGatherSDNode>(N);
25813 SDValue Index = Gather->getIndex();
25814 if (Index.getValueType() != MVT::v2i64)
25815 return;
25816 SDValue Mask = Gather->getMask();
25817 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25818 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25819 Gather->getValue(),
25820 DAG.getUNDEF(MVT::v2f32));
25821 if (!Subtarget.hasVLX()) {
25822 // We need to widen the mask, but the instruction will only use 2
25823 // of its elements. So we can use undef.
25824 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25825 DAG.getUNDEF(MVT::v2i1));
25826 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25827 }
25828 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25829 Index, Gather->getScale() };
25830 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25831 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25832 Gather->getMemoryVT(), Gather->getMemOperand());
25833 Results.push_back(Res);
25834 Results.push_back(Res.getValue(2));
25835 return;
25836 }
25837 if (VT == MVT::v2i32) {
25838 auto *Gather = cast<MaskedGatherSDNode>(N);
25839 SDValue Index = Gather->getIndex();
25840 SDValue Mask = Gather->getMask();
25841 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25842 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25843 Gather->getValue(),
25844 DAG.getUNDEF(MVT::v2i32));
25845 // If the index is v2i64 we can use it directly.
25846 if (Index.getValueType() == MVT::v2i64 &&
25847 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25848 if (!Subtarget.hasVLX()) {
25849 // We need to widen the mask, but the instruction will only use 2
25850 // of its elements. So we can use undef.
25851 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25852 DAG.getUNDEF(MVT::v2i1));
25853 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25854 }
25855 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25856 Index, Gather->getScale() };
25857 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25858 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25859 Gather->getMemoryVT(), Gather->getMemOperand());
25860 SDValue Chain = Res.getValue(2);
25861 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25862 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25863 DAG.getIntPtrConstant(0, dl));
25864 Results.push_back(Res);
25865 Results.push_back(Chain);
25866 return;
25867 }
25868 EVT IndexVT = Index.getValueType();
25869 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25870 IndexVT.getScalarType(), 4);
25871 // Otherwise we need to custom widen everything to avoid promotion.
25872 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25873 DAG.getUNDEF(IndexVT));
25874 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25875 DAG.getConstant(0, dl, MVT::v2i1));
25876 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25877 Index, Gather->getScale() };
25878 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25879 Gather->getMemoryVT(), dl, Ops,
25880 Gather->getMemOperand());
25881 SDValue Chain = Res.getValue(1);
25882 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25883 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25884 DAG.getIntPtrConstant(0, dl));
25885 Results.push_back(Res);
25886 Results.push_back(Chain);
25887 return;
25888 }
25889 break;
25890 }
25891 }
25892 }
25894 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25895 switch ((X86ISD::NodeType)Opcode) {
25896 case X86ISD::FIRST_NUMBER: break;
25897 case X86ISD::BSF: return "X86ISD::BSF";
25898 case X86ISD::BSR: return "X86ISD::BSR";
25899 case X86ISD::SHLD: return "X86ISD::SHLD";
25900 case X86ISD::SHRD: return "X86ISD::SHRD";
25901 case X86ISD::FAND: return "X86ISD::FAND";
25902 case X86ISD::FANDN: return "X86ISD::FANDN";
25903 case X86ISD::FOR: return "X86ISD::FOR";
25904 case X86ISD::FXOR: return "X86ISD::FXOR";
25905 case X86ISD::FILD: return "X86ISD::FILD";
25906 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25907 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25908 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25909 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25910 case X86ISD::FLD: return "X86ISD::FLD";
25911 case X86ISD::FST: return "X86ISD::FST";
25912 case X86ISD::CALL: return "X86ISD::CALL";
25913 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25914 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25915 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25916 case X86ISD::BT: return "X86ISD::BT";
25917 case X86ISD::CMP: return "X86ISD::CMP";
25918 case X86ISD::COMI: return "X86ISD::COMI";
25919 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25920 case X86ISD::CMPM: return "X86ISD::CMPM";
25921 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25922 case X86ISD::SETCC: return "X86ISD::SETCC";
25923 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25924 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25925 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25926 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25927 case X86ISD::CMOV: return "X86ISD::CMOV";
25928 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25929 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25930 case X86ISD::IRET: return "X86ISD::IRET";
25931 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25932 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25933 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25934 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25935 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25936 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25937 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25938 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25939 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25940 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25941 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25942 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25943 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25944 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25945 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25946 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25947 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25948 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25949 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25950 case X86ISD::HADD: return "X86ISD::HADD";
25951 case X86ISD::HSUB: return "X86ISD::HSUB";
25952 case X86ISD::FHADD: return "X86ISD::FHADD";
25953 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25954 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25955 case X86ISD::FMAX: return "X86ISD::FMAX";
25956 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25957 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25958 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25959 case X86ISD::FMIN: return "X86ISD::FMIN";
25960 case X86ISD::FMINS: return "X86ISD::FMINS";
25961 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25962 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25963 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25964 case X86ISD::FMINC: return "X86ISD::FMINC";
25965 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25966 case X86ISD::FRCP: return "X86ISD::FRCP";
25967 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25968 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25969 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25970 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25971 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25972 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25973 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25974 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25975 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25976 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25977 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25978 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25979 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25980 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25981 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25982 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25983 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25984 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25985 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25986 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25987 case X86ISD::LADD: return "X86ISD::LADD";
25988 case X86ISD::LSUB: return "X86ISD::LSUB";
25989 case X86ISD::LOR: return "X86ISD::LOR";
25990 case X86ISD::LXOR: return "X86ISD::LXOR";
25991 case X86ISD::LAND: return "X86ISD::LAND";
25992 case X86ISD::LINC: return "X86ISD::LINC";
25993 case X86ISD::LDEC: return "X86ISD::LDEC";
25994 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25995 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25996 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25997 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25998 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25999 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
26000 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
26001 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
26002 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
26003 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
26004 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
26005 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
26006 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
26007 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
26008 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
26009 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
26010 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
26011 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
26012 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
26013 case X86ISD::VSHL: return "X86ISD::VSHL";
26014 case X86ISD::VSRL: return "X86ISD::VSRL";
26015 case X86ISD::VSRA: return "X86ISD::VSRA";
26016 case X86ISD::VSHLI: return "X86ISD::VSHLI";
26017 case X86ISD::VSRLI: return "X86ISD::VSRLI";
26018 case X86ISD::VSRAI: return "X86ISD::VSRAI";
26019 case X86ISD::VSRAV: return "X86ISD::VSRAV";
26020 case X86ISD::VROTLI: return "X86ISD::VROTLI";
26021 case X86ISD::VROTRI: return "X86ISD::VROTRI";
26022 case X86ISD::VPPERM: return "X86ISD::VPPERM";
26023 case X86ISD::CMPP: return "X86ISD::CMPP";
26024 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
26025 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
26026 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
26027 case X86ISD::ADD: return "X86ISD::ADD";
26028 case X86ISD::SUB: return "X86ISD::SUB";
26029 case X86ISD::ADC: return "X86ISD::ADC";
26030 case X86ISD::SBB: return "X86ISD::SBB";
26031 case X86ISD::SMUL: return "X86ISD::SMUL";
26032 case X86ISD::UMUL: return "X86ISD::UMUL";
26033 case X86ISD::SMUL8: return "X86ISD::SMUL8";
26034 case X86ISD::UMUL8: return "X86ISD::UMUL8";
26035 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
26036 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
26037 case X86ISD::INC: return "X86ISD::INC";
26038 case X86ISD::DEC: return "X86ISD::DEC";
26039 case X86ISD::OR: return "X86ISD::OR";
26040 case X86ISD::XOR: return "X86ISD::XOR";
26041 case X86ISD::AND: return "X86ISD::AND";
26042 case X86ISD::BEXTR: return "X86ISD::BEXTR";
26043 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
26044 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
26045 case X86ISD::PTEST: return "X86ISD::PTEST";
26046 case X86ISD::TESTP: return "X86ISD::TESTP";
26047 case X86ISD::KORTEST: return "X86ISD::KORTEST";
26048 case X86ISD::KTEST: return "X86ISD::KTEST";
26049 case X86ISD::KADD: return "X86ISD::KADD";
26050 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
26051 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
26052 case X86ISD::PACKSS: return "X86ISD::PACKSS";
26053 case X86ISD::PACKUS: return "X86ISD::PACKUS";
26054 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
26055 case X86ISD::VALIGN: return "X86ISD::VALIGN";
26056 case X86ISD::VSHLD: return "X86ISD::VSHLD";
26057 case X86ISD::VSHRD: return "X86ISD::VSHRD";
26058 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
26059 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
26060 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
26061 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
26062 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
26063 case X86ISD::SHUFP: return "X86ISD::SHUFP";
26064 case X86ISD::SHUF128: return "X86ISD::SHUF128";
26065 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
26066 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
26067 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
26068 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
26069 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
26070 case X86ISD::MOVSD: return "X86ISD::MOVSD";
26071 case X86ISD::MOVSS: return "X86ISD::MOVSS";
26072 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
26073 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
26074 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
26075 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
26076 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
26077 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
26078 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
26079 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
26080 case X86ISD::VPERMV: return "X86ISD::VPERMV";
26081 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
26082 case X86ISD::VPERMI: return "X86ISD::VPERMI";
26083 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
26084 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
26085 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
26086 case X86ISD::VRANGE: return "X86ISD::VRANGE";
26087 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
26088 case X86ISD::VRANGES: return "X86ISD::VRANGES";
26089 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
26090 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
26091 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
26092 case X86ISD::PSADBW: return "X86ISD::PSADBW";
26093 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
26094 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
26095 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
26096 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
26097 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
26098 case X86ISD::MFENCE: return "X86ISD::MFENCE";
26099 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
26100 case X86ISD::SAHF: return "X86ISD::SAHF";
26101 case X86ISD::RDRAND: return "X86ISD::RDRAND";
26102 case X86ISD::RDSEED: return "X86ISD::RDSEED";
26103 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
26104 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
26105 case X86ISD::VPSHA: return "X86ISD::VPSHA";
26106 case X86ISD::VPSHL: return "X86ISD::VPSHL";
26107 case X86ISD::VPCOM: return "X86ISD::VPCOM";
26108 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
26109 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
26110 case X86ISD::FMSUB: return "X86ISD::FMSUB";
26111 case X86ISD::FNMADD: return "X86ISD::FNMADD";
26112 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
26113 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
26114 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
26115 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
26116 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
26117 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
26118 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
26119 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
26120 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
26121 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
26122 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
26123 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
26124 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
26125 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
26126 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
26127 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
26128 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
26129 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
26130 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
26131 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
26132 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
26133 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
26134 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
26135 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
26136 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
26137 case X86ISD::XTEST: return "X86ISD::XTEST";
26138 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
26139 case X86ISD::EXPAND: return "X86ISD::EXPAND";
26140 case X86ISD::SELECT: return "X86ISD::SELECT";
26141 case X86ISD::SELECTS: return "X86ISD::SELECTS";
26142 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
26143 case X86ISD::RCP14: return "X86ISD::RCP14";
26144 case X86ISD::RCP14S: return "X86ISD::RCP14S";
26145 case X86ISD::RCP28: return "X86ISD::RCP28";
26146 case X86ISD::RCP28S: return "X86ISD::RCP28S";
26147 case X86ISD::EXP2: return "X86ISD::EXP2";
26148 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
26149 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
26150 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
26151 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
26152 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
26153 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
26154 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
26155 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
26156 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
26157 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
26158 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
26159 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
26160 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
26161 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
26162 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
26163 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
26164 case X86ISD::SCALEF: return "X86ISD::SCALEF";
26165 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
26166 case X86ISD::ADDS: return "X86ISD::ADDS";
26167 case X86ISD::SUBS: return "X86ISD::SUBS";
26168 case X86ISD::AVG: return "X86ISD::AVG";
26169 case X86ISD::MULHRS: return "X86ISD::MULHRS";
26170 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
26171 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
26172 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
26173 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
26174 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
26175 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
26176 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
26177 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
26178 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
26179 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
26180 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
26181 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
26182 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
26183 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
26184 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
26185 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
26186 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
26187 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
26188 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
26189 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
26190 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
26191 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
26192 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
26193 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
26194 case X86ISD::LWPINS: return "X86ISD::LWPINS";
26195 case X86ISD::MGATHER: return "X86ISD::MGATHER";
26196 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
26197 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
26198 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
26199 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
26200 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
26201 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
26202 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
26203 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
26204 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
26205 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
26206 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
26207 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
26208 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
26209 }
26210 return nullptr;
26211 }
26213 /// Return true if the addressing mode represented by AM is legal for this
26214 /// target, for a load/store of the specified type.
26215 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
26216 const AddrMode &AM, Type *Ty,
26217 unsigned AS,
26218 Instruction *I) const {
26219 // X86 supports extremely general addressing modes.
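// The most general form accepted here is roughly
// BaseReg + Scale*IndexReg + Disp32 + GV, e.g. leaq sym+8(%rdi,%rsi,4), %rax.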
26220 CodeModel::Model M = getTargetMachine().getCodeModel();
26222 // X86 allows a sign-extended 32-bit immediate field as a displacement.
26223 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
26224 return false;
26226 if (AM.BaseGV) {
26227 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
26229 // If a reference to this global requires an extra load, we can't fold it.
26230 if (isGlobalStubReference(GVFlags))
26231 return false;
26233 // If BaseGV requires a register for the PIC base, we cannot also have a
26234 // BaseReg specified.
26235 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
26236 return false;
26238 // If lower 4G is not available, then we must use rip-relative addressing.
26239 if ((M != CodeModel::Small || isPositionIndependent()) &&
26240 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
26241 return false;
26242 }
26244 switch (AM.Scale) {
26245 case 0:
26246 case 1:
26247 case 2:
26248 case 4:
26249 case 8:
26250 // These scales always work.
26251 break;
26252 case 3:
26253 case 5:
26254 case 9:
26255 // These scales are formed with basereg+scalereg. Only accept if there is
26256 // no basereg yet.
26257 if (AM.HasBaseReg)
26258 return false;
26259 break;
26260 default: // Other stuff never works.
26261 return false;
26262 }
26264 return true;
26265 }
26267 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
26268 unsigned Bits = Ty->getScalarSizeInBits();
26270 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
26271 // particularly cheaper than those without.
26272 if (Bits == 8)
26273 return false;
26275 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
26276 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
26277 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
26278 return false;
26280 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
26281 // shifts just as cheap as scalar ones.
26282 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
26283 return false;
26285 // AVX512BW has shifts such as vpsllvw.
26286 if (Subtarget.hasBWI() && Bits == 16)
26287 return false;
26289 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
26290 // fully general vector.
26291 return true;
26292 }
26294 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
26295 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26296 return false;
26297 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
26298 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
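// Wider-to-narrower integer truncates are free on x86 because narrow
// operations simply use the low subregister, e.g. %eax for a value in %rax.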
26299 return NumBits1 > NumBits2;
26300 }
26302 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
26303 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26304 return false;
26306 if (!isTypeLegal(EVT::getEVT(Ty1)))
26307 return false;
26309 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
26311 // Assuming the caller doesn't have a zeroext or signext return parameter,
26312 // truncation all the way down to i1 is valid.
26313 return true;
26314 }
26316 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
26317 return isInt<32>(Imm);
26318 }
26320 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
26321 // Can also use sub to handle negated immediates.
26322 return isInt<32>(Imm);
26323 }
26325 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
26326 if (!VT1.isInteger() || !VT2.isInteger())
26327 return false;
26328 unsigned NumBits1 = VT1.getSizeInBits();
26329 unsigned NumBits2 = VT2.getSizeInBits();
26330 return NumBits1 > NumBits2;
26331 }
26333 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
26334 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26335 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
26336 }
26338 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
26339 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26340 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
26341 }
26343 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
26344 EVT VT1 = Val.getValueType();
26345 if (isZExtFree(VT1, VT2))
26346 return true;
26348 if (Val.getOpcode() != ISD::LOAD)
26349 return false;
26351 if (!VT1.isSimple() || !VT1.isInteger() ||
26352 !VT2.isSimple() || !VT2.isInteger())
26353 return false;
26355 switch (VT1.getSimpleVT().SimpleTy) {
26356 default: break;
26357 case MVT::i8:
26358 case MVT::i16:
26359 case MVT::i32:
26360 // X86 has 8, 16, and 32-bit zero-extending loads.
26361 return true;
26362 }
26364 return false;
26365 }
26367 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26368 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26370 // There is no extending load for vXi1.
26371 if (SrcVT.getScalarType() == MVT::i1)
26372 return false;
26374 return true;
26375 }
26377 bool
26378 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26379 if (!Subtarget.hasAnyFMA())
26380 return false;
26382 VT = VT.getScalarType();
26384 if (!VT.isSimple())
26385 return false;
26387 switch (VT.getSimpleVT().SimpleTy) {
26388 case MVT::f32:
26389 case MVT::f64:
26390 return true;
26391 default:
26392 break;
26393 }
26395 return false;
26396 }
26398 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26399 // i16 instructions are longer (0x66 prefix) and potentially slower.
26400 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
26401 }
26403 /// Targets can use this to indicate that they only support *some*
26404 /// VECTOR_SHUFFLE operations, those with specific masks.
26405 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26406 /// are assumed to be legal.
26407 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26408 if (!VT.isSimple())
26409 return false;
26411 // Not for i1 vectors
26412 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26413 return false;
26415 // Very little shuffling can be done for 64-bit vectors right now.
26416 if (VT.getSimpleVT().getSizeInBits() == 64)
26417 return false;
26419 // We only care that the types being shuffled are legal. The lowering can
26420 // handle any possible shuffle mask that results.
26421 return isTypeLegal(VT.getSimpleVT());
26422 }
26424 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
26425 EVT VT) const {
26426 // Don't convert an 'and' into a shuffle that we don't directly support.
26427 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
26428 if (!Subtarget.hasAVX2())
26429 if (VT == MVT::v32i8 || VT == MVT::v16i16)
26430 return false;
26432 // Just delegate to the generic legality, clear masks aren't special.
26433 return isShuffleMaskLegal(Mask, VT);
26434 }
26436 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26437 // If the subtarget is using retpolines, we must not generate jump tables.
26438 if (Subtarget.useRetpoline())
26439 return false;
26441 // Otherwise, fall back on the generic logic.
26442 return TargetLowering::areJTsAllowed(Fn);
26443 }
26445 //===----------------------------------------------------------------------===//
26446 // X86 Scheduler Hooks
26447 //===----------------------------------------------------------------------===//
26449 /// Utility function to emit xbegin specifying the start of an RTM region.
26450 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26451 const TargetInstrInfo *TII) {
26452 DebugLoc DL = MI.getDebugLoc();
26454 const BasicBlock *BB = MBB->getBasicBlock();
26455 MachineFunction::iterator I = ++MBB->getIterator();
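// XBEGIN starts an RTM transaction; on abort the CPU resumes at the
// fallback address with an abort status code in EAX, which is why the
// fallback block below copies EAX into the result register.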
26457 // For the v = xbegin(), we generate
26458 //
26459 // thisMBB:
26460 //  xbegin fallMBB
26461 //
26462 // mainMBB:
26463 //  s0 = -1
26464 //
26465 // fallMBB:
26466 //  eax = # XABORT_DEF
26467 //  s1 = eax
26468 //
26469 // sinkMBB:
26470 //  v = phi(s0/mainBB, s1/fallBB)
26472 MachineBasicBlock *thisMBB = MBB;
26473 MachineFunction *MF = MBB->getParent();
26474 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26475 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26476 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26477 MF->insert(I, mainMBB);
26478 MF->insert(I, fallMBB);
26479 MF->insert(I, sinkMBB);
26481 // Transfer the remainder of BB and its successor edges to sinkMBB.
26482 sinkMBB->splice(sinkMBB->begin(), MBB,
26483 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26484 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26486 MachineRegisterInfo &MRI = MF->getRegInfo();
26487 unsigned DstReg = MI.getOperand(0).getReg();
26488 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26489 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26490 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26492 // thisMBB:
26493 //  xbegin fallMBB
26494 //  # fallthrough to mainMBB
26495 //  # abort path to fallMBB
26496 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26497 thisMBB->addSuccessor(mainMBB);
26498 thisMBB->addSuccessor(fallMBB);
26500 // mainMBB:
26501 //  mainDstReg := -1
26502 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26503 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26504 mainMBB->addSuccessor(sinkMBB);
26506 // fallMBB:
26507 // ; pseudo instruction to model hardware's definition from XABORT
26508 // EAX := XABORT_DEF
26509 // fallDstReg := EAX
26510 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26511 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26512 .addReg(X86::EAX);
26513 fallMBB->addSuccessor(sinkMBB);
26515 // sinkMBB:
26516 //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26517 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26518 .addReg(mainDstReg).addMBB(mainMBB)
26519 .addReg(fallDstReg).addMBB(fallMBB);
26521 MI.eraseFromParent();
26522 return sinkMBB;
26523 }
26525 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26526 const X86Subtarget &Subtarget) {
26527 DebugLoc dl = MI.getDebugLoc();
26528 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
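// WRPKRU loads the PKRU register from EAX and requires ECX = EDX = 0,
// hence the two explicit zeroing instructions emitted below.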
26530 // insert input VAL into EAX
26531 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26532 .addReg(MI.getOperand(0).getReg());
26533 // insert zero to ECX
26534 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26536 // insert zero to EDX
26537 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26539 // insert WRPKRU instruction
26540 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26542 MI.eraseFromParent(); // The pseudo is gone now.
26543 return BB;
26544 }
26546 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26547 const X86Subtarget &Subtarget) {
26548 DebugLoc dl = MI.getDebugLoc();
26549 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26551 // insert zero to ECX
26552 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26554 // insert RDPKRU instruction
26555 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26556 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26557 .addReg(X86::EAX);
26559 MI.eraseFromParent(); // The pseudo is gone now.
26560 return BB;
26561 }
26563 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26564 const X86Subtarget &Subtarget,
26565 unsigned Opc) {
26566 DebugLoc dl = MI.getDebugLoc();
26567 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26568 // Address into RAX/EAX, other two args into ECX, EDX.
26569 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26570 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26571 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26572 for (int i = 0; i < X86::AddrNumOperands; ++i)
26573 MIB.add(MI.getOperand(i));
26575 unsigned ValOps = X86::AddrNumOperands;
26576 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26577 .addReg(MI.getOperand(ValOps).getReg());
26578 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26579 .addReg(MI.getOperand(ValOps + 1).getReg());
26581 // The instruction doesn't actually take any operands though.
26582 BuildMI(*BB, MI, dl, TII->get(Opc));
26584 MI.eraseFromParent(); // The pseudo is gone now.
26585 return BB;
26586 }
26588 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26589 const X86Subtarget &Subtarget) {
26590 DebugLoc dl = MI->getDebugLoc();
26591 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26592 // Address into RAX/EAX
26593 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26594 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26595 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26596 for (int i = 0; i < X86::AddrNumOperands; ++i)
26597 MIB.add(MI->getOperand(i));
26599 // The instruction doesn't actually take any operands though.
26600 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26602 MI->eraseFromParent(); // The pseudo is gone now.
26603 return BB;
26604 }
26608 MachineBasicBlock *
26609 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26610 MachineBasicBlock *MBB) const {
26611 // Emit va_arg instruction on X86-64.
26613 // Operands to this pseudo-instruction:
26614 // 0 ) Output : destination address (reg)
26615 // 1-5) Input : va_list address (addr, i64mem)
26616 // 6 ) ArgSize : Size (in bytes) of vararg type
26617 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26618 // 8 ) Align : Alignment of type
26619 // 9 ) EFLAGS (implicit-def)
26621 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26622 static_assert(X86::AddrNumOperands == 5,
26623 "VAARG_64 assumes 5 address operands");
26625 unsigned DestReg = MI.getOperand(0).getReg();
26626 MachineOperand &Base = MI.getOperand(1);
26627 MachineOperand &Scale = MI.getOperand(2);
26628 MachineOperand &Index = MI.getOperand(3);
26629 MachineOperand &Disp = MI.getOperand(4);
26630 MachineOperand &Segment = MI.getOperand(5);
26631 unsigned ArgSize = MI.getOperand(6).getImm();
26632 unsigned ArgMode = MI.getOperand(7).getImm();
26633 unsigned Align = MI.getOperand(8).getImm();
26635 // Memory Reference
26636 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26637 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26638 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26640 // Machine Information
26641 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26642 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26643 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26644 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26645 DebugLoc DL = MI.getDebugLoc();
26647 // struct va_list {
26648 //   i32 gp_offset
26649 //   i32 fp_offset
26650 //   i64 overflow_area (address)
26651 //   i64 reg_save_area (address)
26652 // }
26653 // sizeof(va_list) = 24
26654 // alignment(va_list) = 8
26656 unsigned TotalNumIntRegs = 6;
26657 unsigned TotalNumXMMRegs = 8;
26658 bool UseGPOffset = (ArgMode == 1);
26659 bool UseFPOffset = (ArgMode == 2);
26660 unsigned MaxOffset = TotalNumIntRegs * 8 +
26661 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
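// In the SysV x86-64 ABI this gives gp_offset in [0, 48) for the six GPR
// slots and fp_offset in [48, 176) for the eight 16-byte XMM slots.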
26663 /* Align ArgSize to a multiple of 8 */
26664 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26665 bool NeedsAlign = (Align > 8);
26667 MachineBasicBlock *thisMBB = MBB;
26668 MachineBasicBlock *overflowMBB;
26669 MachineBasicBlock *offsetMBB;
26670 MachineBasicBlock *endMBB;
26672 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26673 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26674 unsigned OffsetReg = 0;
26676 if (!UseGPOffset && !UseFPOffset) {
26677 // If we only pull from the overflow region, we don't create a branch.
26678 // We don't need to alter control flow.
26679 OffsetDestReg = 0; // unused
26680 OverflowDestReg = DestReg;
26682 offsetMBB = nullptr;
26683 overflowMBB = thisMBB;
26684 endMBB = thisMBB;
26685 } else {
26686 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26687 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26688 // If not, pull from overflow_area. (branch to overflowMBB)
26689 //
26690 //       thisMBB
26691 //          |     .
26692 //          |        .
26693 //     offsetMBB   overflowMBB
26694 //          |        .
26695 //          |     .
26696 //        endMBB
26698 // Registers for the PHI in endMBB
26699 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26700 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26702 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26703 MachineFunction *MF = MBB->getParent();
26704 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26705 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26706 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26708 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26710 // Insert the new basic blocks
26711 MF->insert(MBBIter, offsetMBB);
26712 MF->insert(MBBIter, overflowMBB);
26713 MF->insert(MBBIter, endMBB);
26715 // Transfer the remainder of MBB and its successor edges to endMBB.
26716 endMBB->splice(endMBB->begin(), thisMBB,
26717 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26718 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26720 // Make offsetMBB and overflowMBB successors of thisMBB
26721 thisMBB->addSuccessor(offsetMBB);
26722 thisMBB->addSuccessor(overflowMBB);
26724 // endMBB is a successor of both offsetMBB and overflowMBB
26725 offsetMBB->addSuccessor(endMBB);
26726 overflowMBB->addSuccessor(endMBB);
26728 // Load the offset value into a register
26729 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26730 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26731 .add(Base)
26732 .add(Scale)
26733 .add(Index)
26734 .addDisp(Disp, UseFPOffset ? 4 : 0)
26735 .add(Segment)
26736 .setMemRefs(MMOBegin, MMOEnd);
26738 // Check if there is enough room left to pull this argument.
26739 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26740 .addReg(OffsetReg)
26741 .addImm(MaxOffset + 8 - ArgSizeA8);
26743 // Branch to "overflowMBB" if offset >= max
26744 // Fall through to "offsetMBB" otherwise
26745 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26746 .addMBB(overflowMBB);
26747 }
26749 // In offsetMBB, emit code to use the reg_save_area.
26750 if (offsetMBB) {
26751 assert(OffsetReg != 0);
26753 // Read the reg_save_area address.
26754 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26755 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26756 .add(Base)
26757 .add(Scale)
26758 .add(Index)
26759 .addDisp(Disp, 16)
26760 .add(Segment)
26761 .setMemRefs(MMOBegin, MMOEnd);
26763 // Zero-extend the offset
26764 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26765 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26766 .addImm(0)
26767 .addReg(OffsetReg)
26768 .addImm(X86::sub_32bit);
26770 // Add the offset to the reg_save_area to get the final address.
26771 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26772 .addReg(OffsetReg64)
26773 .addReg(RegSaveReg);
26775 // Compute the offset for the next argument
26776 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26777 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26778 .addReg(OffsetReg)
26779 .addImm(UseFPOffset ? 16 : 8);
26781 // Store it back into the va_list.
26782 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26783 .add(Base)
26784 .add(Scale)
26785 .add(Index)
26786 .addDisp(Disp, UseFPOffset ? 4 : 0)
26787 .add(Segment)
26788 .addReg(NextOffsetReg)
26789 .setMemRefs(MMOBegin, MMOEnd);
26791 // Jump to endMBB
26792 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26793 .addMBB(endMBB);
26794 }
26796 //
26797 // Emit code to use overflow area
26798 //
26800 // Load the overflow_area address into a register.
26801 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26802 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26803 .add(Base)
26804 .add(Scale)
26805 .add(Index)
26806 .addDisp(Disp, 8)
26807 .add(Segment)
26808 .setMemRefs(MMOBegin, MMOEnd);
26810 // If we need to align it, do so. Otherwise, just copy the address
26811 // to OverflowDestReg.
26812 if (NeedsAlign) {
26813 // Align the overflow address
26814 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26815 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26817 // aligned_addr = (addr + (align-1)) & ~(align-1)
26818 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26819 .addReg(OverflowAddrReg)
26820 .addImm(Align-1);
26822 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26823 .addReg(TmpReg)
26824 .addImm(~(uint64_t)(Align-1));
26825 } else {
26826 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26827 .addReg(OverflowAddrReg);
26828 }
26830 // Compute the next overflow address after this argument.
26831 // (the overflow address should be kept 8-byte aligned)
26832 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26833 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26834 .addReg(OverflowDestReg)
26835 .addImm(ArgSizeA8);
26837 // Store the new overflow address.
26838 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26839 .add(Base)
26840 .add(Scale)
26841 .add(Index)
26842 .addDisp(Disp, 8)
26843 .add(Segment)
26844 .addReg(NextAddrReg)
26845 .setMemRefs(MMOBegin, MMOEnd);
26847 // If we branched, emit the PHI to the front of endMBB.
26848 if (offsetMBB) {
26849 BuildMI(*endMBB, endMBB->begin(), DL,
26850 TII->get(X86::PHI), DestReg)
26851 .addReg(OffsetDestReg).addMBB(offsetMBB)
26852 .addReg(OverflowDestReg).addMBB(overflowMBB);
26853 }
26855 // Erase the pseudo instruction
26856 MI.eraseFromParent();
26858 return endMBB;
26859 }
26861 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26862 MachineInstr &MI, MachineBasicBlock *MBB) const {
26863 // Emit code to save XMM registers to the stack. The ABI says that the
26864 // number of registers to save is given in %al, so it's theoretically
26865 // possible to do an indirect jump trick to avoid saving all of them,
26866 // however this code takes a simpler approach and just executes all
26867 // of the stores if %al is non-zero. It's less code, and it's probably
26868 // easier on the hardware branch predictor, and stores aren't all that
26869 // expensive anyway.
26871 // Create the new basic blocks. One block contains all the XMM stores,
26872 // and one block is the final destination regardless of whether any
26873 // stores were performed.
26874 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26875 MachineFunction *F = MBB->getParent();
26876 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26877 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26878 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26879 F->insert(MBBIter, XMMSaveMBB);
26880 F->insert(MBBIter, EndMBB);
26882 // Transfer the remainder of MBB and its successor edges to EndMBB.
26883 EndMBB->splice(EndMBB->begin(), MBB,
26884 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26885 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26887 // The original block will now fall through to the XMM save block.
26888 MBB->addSuccessor(XMMSaveMBB);
26889 // The XMMSaveMBB will fall through to the end block.
26890 XMMSaveMBB->addSuccessor(EndMBB);
26892 // Now add the instructions.
26893 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26894 DebugLoc DL = MI.getDebugLoc();
26896 unsigned CountReg = MI.getOperand(0).getReg();
26897 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26898 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26900 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26901 // If %al is 0, branch around the XMM save block.
26902 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26903 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26904 MBB->addSuccessor(EndMBB);
26905 }
26907 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26908 // that was just emitted, but clearly shouldn't be "saved".
26909 assert((MI.getNumOperands() <= 3 ||
26910 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26911 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26912 "Expected last argument to be EFLAGS");
26913 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26914 // In the XMM save block, save all the XMM argument registers.
26915 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26916 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26917 MachineMemOperand *MMO = F->getMachineMemOperand(
26918 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26919 MachineMemOperand::MOStore,
26920 /*Size=*/16, /*Align=*/16);
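// MOVAPS/VMOVAPS requires 16-byte alignment; each slot in the register
// save area is 16 bytes and 16-byte aligned, so the aligned store is safe.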
26921 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26922 .addFrameIndex(RegSaveFrameIndex)
26923 .addImm(/*Scale=*/1)
26924 .addReg(/*IndexReg=*/0)
26925 .addImm(/*Disp=*/Offset)
26926 .addReg(/*Segment=*/0)
26927 .addReg(MI.getOperand(i).getReg())
26928 .addMemOperand(MMO);
26929 }
26931 MI.eraseFromParent(); // The pseudo instruction is gone now.
26933 return EndMBB;
26934 }
26936 // The EFLAGS operand of SelectItr might be missing a kill marker
26937 // because there were multiple uses of EFLAGS, and ISel didn't know
26938 // which to mark. Figure out whether SelectItr should have had a
26939 // kill marker, and set it if it should. Returns the correct kill
26940 // marker value.
26941 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26942 MachineBasicBlock* BB,
26943 const TargetRegisterInfo* TRI) {
26944 // Scan forward through BB for a use/def of EFLAGS.
26945 MachineBasicBlock::iterator miI(std::next(SelectItr));
26946 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26947 const MachineInstr& mi = *miI;
26948 if (mi.readsRegister(X86::EFLAGS))
26949 return false;
26950 if (mi.definesRegister(X86::EFLAGS))
26951 break; // Should have kill-flag - update below.
26952 }
26954 // If we hit the end of the block, check whether EFLAGS is live into a
26955 // successor.
26956 if (miI == BB->end()) {
26957 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26958 sEnd = BB->succ_end();
26959 sItr != sEnd; ++sItr) {
26960 MachineBasicBlock* succ = *sItr;
26961 if (succ->isLiveIn(X86::EFLAGS))
26962 return false;
26963 }
26964 }
26966 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26967 // out. SelectMI should have a kill flag on EFLAGS.
26968 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26969 return true;
26970 }
26972 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26973 // together with other CMOV pseudo-opcodes into a single basic-block with
26974 // conditional jump around it.
26975 static bool isCMOVPseudo(MachineInstr &MI) {
26976 switch (MI.getOpcode()) {
26977 case X86::CMOV_FR32:
26978 case X86::CMOV_FR64:
26979 case X86::CMOV_GR8:
26980 case X86::CMOV_GR16:
26981 case X86::CMOV_GR32:
26982 case X86::CMOV_RFP32:
26983 case X86::CMOV_RFP64:
26984 case X86::CMOV_RFP80:
26985 case X86::CMOV_V2F64:
26986 case X86::CMOV_V2I64:
26987 case X86::CMOV_V4F32:
26988 case X86::CMOV_V4F64:
26989 case X86::CMOV_V4I64:
26990 case X86::CMOV_V16F32:
26991 case X86::CMOV_V8F32:
26992 case X86::CMOV_V8F64:
26993 case X86::CMOV_V8I64:
26994 case X86::CMOV_V8I1:
26995 case X86::CMOV_V16I1:
26996 case X86::CMOV_V32I1:
26997 case X86::CMOV_V64I1:
26998 return true;
27000 default:
27001 return false;
27002 }
27003 }
27005 // Helper function, which inserts PHI functions into SinkMBB:
27006 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
27007 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
27008 // in [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
27009 // the last PHI function inserted.
27010 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
27011 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
27012 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
27013 MachineBasicBlock *SinkMBB) {
27014 MachineFunction *MF = TrueMBB->getParent();
27015 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
27016 DebugLoc DL = MIItBegin->getDebugLoc();
27018 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
27019 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27021 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
27023 // As we are creating the PHIs, we have to be careful if there is more than
27024 // one. Later CMOVs may reference the results of earlier CMOVs, but later
27025 // PHIs have to reference the individual true/false inputs from earlier PHIs.
27026 // That also means that PHI construction must work forward from earlier to
27027 // later, and that the code must maintain a mapping from earlier PHI's
27028 // destination registers, and the registers that went into the PHI.
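// For example, with
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
// the PHI for %t3 cannot reference %t2 (a PHI defined in the same block);
// it must use the value %t2 would have on the corresponding incoming edge,
// which is exactly what the rewrite table below records.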
27029 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
27030 MachineInstrBuilder MIB;
27032 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
27033 unsigned DestReg = MIIt->getOperand(0).getReg();
27034 unsigned Op1Reg = MIIt->getOperand(1).getReg();
27035 unsigned Op2Reg = MIIt->getOperand(2).getReg();
27037 // If this CMOV we are generating is the opposite condition from
27038 // the jump we generated, then we have to swap the operands for the
27039 // PHI that is going to be generated.
27040 if (MIIt->getOperand(3).getImm() == OppCC)
27041 std::swap(Op1Reg, Op2Reg);
27043 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
27044 Op1Reg = RegRewriteTable[Op1Reg].first;
27046 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
27047 Op2Reg = RegRewriteTable[Op2Reg].second;
27049 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
27050 .addReg(Op1Reg)
27051 .addMBB(FalseMBB)
27052 .addReg(Op2Reg)
27053 .addMBB(TrueMBB);
27055 // Add this PHI to the rewrite table.
27056 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
27057 }
27059 return MIB;
27060 }
27062 // Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
27063 MachineBasicBlock *
27064 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
27065 MachineInstr &SecondCascadedCMOV,
27066 MachineBasicBlock *ThisMBB) const {
27067 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27068 DebugLoc DL = FirstCMOV.getDebugLoc();
27070 // We lower cascaded CMOVs such as
27072 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
27074 // to two successive branches.
27076 // Without this, we would add a PHI between the two jumps, which ends up
27077 // creating a few copies all around. For instance, for
27079 // (sitofp (zext (fcmp une)))
27081 // we would generate:
27083 // ucomiss %xmm1, %xmm0
27084 // movss <1.0f>, %xmm0
27085 // movaps %xmm0, %xmm1
27086 // jne .LBB5_2
27087 // xorps %xmm1, %xmm1
27088 // .LBB5_2:
27089 // jp .LBB5_4
27090 // movaps %xmm1, %xmm0
27091 // .LBB5_4:
27092 // retq
27094 // because this custom-inserter would have generated:
27095 //
27096 //   A
27097 //   | \
27098 //   |  B
27099 //   | /
27100 //   C
27101 //   | \
27102 //   |  D
27103 //   | /
27104 //   E
27105 //
27106 // A: X = ...; Y = ...
27107 // B: empty
27108 // C: Z = PHI [X, A], [Y, B]
27109 // D: empty
27110 // E: PHI [X, C], [Z, D]
27112 // If we lower both CMOVs in a single step, we can instead generate:
27113 //
27114 //   A
27115 //   | \
27116 //   |  C
27117 //   | /|
27118 //   |/ |
27119 //   |  |
27120 //   |  D
27121 //   | /
27122 //   E
27123 //
27124 // A: X = ...; Y = ...
27125 // D: empty
27126 // E: PHI [X, A], [X, C], [Y, D]
27128 // Which, in our sitofp/fcmp example, gives us something like:
27130 // ucomiss %xmm1, %xmm0
27131 // movss <1.0f>, %xmm0
27132 // jne .LBB5_4
27133 // jp .LBB5_4
27134 // xorps %xmm0, %xmm0
27135 // .LBB5_4:
27136 // retq
27137 //
27139 // We lower cascaded CMOV into two successive branches to the same block.
27140 // EFLAGS is used by both, so mark it as live in the second.
27141 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27142 MachineFunction *F = ThisMBB->getParent();
27143 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27144 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27145 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27147 MachineFunction::iterator It = ++ThisMBB->getIterator();
27148 F->insert(It, FirstInsertedMBB);
27149 F->insert(It, SecondInsertedMBB);
27150 F->insert(It, SinkMBB);
27152 // For a cascaded CMOV, we lower it to two successive branches to
27153 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
27154 // the FirstInsertedMBB.
27155 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
27157 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27158 // live into the sink and copy blocks.
27159 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27160 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
27161 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
27162 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
27163 SinkMBB->addLiveIn(X86::EFLAGS);
27164 }
27166 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27167 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27168 std::next(MachineBasicBlock::iterator(FirstCMOV)),
27169 ThisMBB->end());
27170 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27172 // Fallthrough block for ThisMBB.
27173 ThisMBB->addSuccessor(FirstInsertedMBB);
27174 // The true block target of the first branch is always SinkMBB.
27175 ThisMBB->addSuccessor(SinkMBB);
27176 // Fallthrough block for FirstInsertedMBB.
27177 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
27178 // The true block for the branch of FirstInsertedMBB.
27179 FirstInsertedMBB->addSuccessor(SinkMBB);
27180 // This is fallthrough.
27181 SecondInsertedMBB->addSuccessor(SinkMBB);
27183 // Create the conditional branch instructions.
27184 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
27185 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
27186 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27188 X86::CondCode SecondCC =
27189 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
27190 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
27191 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
27193 //  SinkMBB:
27194 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
27195 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
27196 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
27197 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
27198 MachineInstrBuilder MIB =
27199 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
27200 .addReg(Op1Reg)
27201 .addMBB(SecondInsertedMBB)
27202 .addReg(Op2Reg)
27203 .addMBB(ThisMBB);
27205 // The edge from FirstInsertedMBB carries the same incoming value as the
27206 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
27207 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
27208 // Copy the PHI result to the register defined by the second CMOV.
27209 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
27210 TII->get(TargetOpcode::COPY),
27211 SecondCascadedCMOV.getOperand(0).getReg())
27212 .addReg(FirstCMOV.getOperand(0).getReg());
27214 // Now remove the CMOVs.
27215 FirstCMOV.eraseFromParent();
27216 SecondCascadedCMOV.eraseFromParent();
27218 return SinkMBB;
27219 }
27221 MachineBasicBlock *
27222 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
27223 MachineBasicBlock *ThisMBB) const {
27224 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27225 DebugLoc DL = MI.getDebugLoc();
27227 // To "insert" a SELECT_CC instruction, we actually have to insert the
27228 // diamond control-flow pattern. The incoming instruction knows the
27229 // destination vreg to set, the condition code register to branch on, the
27230 // true/false values to select between and a branch opcode to use.
27231 //
27232 //  ThisMBB:
27233 //  ...
27234 //   TrueVal = ...
27235 //   cmpTY ccX, r1, r2
27236 //   bCC SinkMBB
27237 //   fallthrough --> FalseMBB
27239 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
27240 // as described above, by inserting a BB, and then making a PHI at the join
27241 // point to select the true and false operands of the CMOV in the PHI.
27243 // The code also handles two different cases of multiple CMOV opcodes
27244 // in a row.
27245 //
27246 // Case 1:
27247 // In this case, there are multiple CMOVs in a row, all of which are based on
27248 // the same condition setting (or the exact opposite condition setting).
27249 // In this case we can lower all the CMOVs using a single inserted BB, and
27250 // then make a number of PHIs at the join point to model the CMOVs. The only
27251 // trickiness here is that in a case like:
27253 // t2 = CMOV cond1 t1, f1
27254 // t3 = CMOV cond1 t2, f2
27256 // when rewriting this into PHIs, we have to perform some renaming on the
27257 // temps since you cannot have a PHI operand refer to a PHI result earlier
27258 // in the same block. The "simple" but wrong lowering would be:
27260 // t2 = PHI t1(BB1), f1(BB2)
27261 // t3 = PHI t2(BB1), f2(BB2)
27263 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
27264 // renaming is to note that on the path through BB1, t2 is really just a
27265 // copy of t1, and do that renaming, properly generating:
27267 // t2 = PHI t1(BB1), f1(BB2)
27268 // t3 = PHI t1(BB1), f2(BB2)
27269 //
27270 // Case 2:
27271 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
27272 // function - EmitLoweredCascadedSelect.
27274 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
27275 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27276 MachineInstr *LastCMOV = &MI;
27277 MachineBasicBlock::iterator NextMIIt =
27278 std::next(MachineBasicBlock::iterator(MI));
27280 // Check for case 1, where there are multiple CMOVs with the same condition
27281 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
27282 // number of jumps the most.
27284 if (isCMOVPseudo(MI)) {
27285 // See if we have a string of CMOVS with the same condition.
27286 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27287 (NextMIIt->getOperand(3).getImm() == CC ||
27288 NextMIIt->getOperand(3).getImm() == OppCC)) {
27289 LastCMOV = &*NextMIIt;
27290 ++NextMIIt;
27291 }
27292 }
27294 // Check for case 2, but only if we didn't already find case 1,
27295 // as indicated by LastCMOV == &MI.
27296 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27297 NextMIIt->getOpcode() == MI.getOpcode() &&
27298 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27299 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27300 NextMIIt->getOperand(1).isKill()) {
27301 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27302 }
27304 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27305 MachineFunction *F = ThisMBB->getParent();
27306 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27307 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27309 MachineFunction::iterator It = ++ThisMBB->getIterator();
27310 F->insert(It, FalseMBB);
27311 F->insert(It, SinkMBB);
27313 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27314 // live into the sink and copy blocks.
27315 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27316 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27317 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27318 FalseMBB->addLiveIn(X86::EFLAGS);
27319 SinkMBB->addLiveIn(X86::EFLAGS);
27320 }
27322 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27323 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27324 std::next(MachineBasicBlock::iterator(LastCMOV)),
27325 ThisMBB->end());
27326 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27328 // Fallthrough block for ThisMBB.
27329 ThisMBB->addSuccessor(FalseMBB);
27330 // The true block target of the first (or only) branch is always a SinkMBB.
27331 ThisMBB->addSuccessor(SinkMBB);
27332 // Fallthrough block for FalseMBB.
27333 FalseMBB->addSuccessor(SinkMBB);
27335 // Create the conditional branch instruction.
27336 unsigned Opc = X86::GetCondBranchFromCond(CC);
27337 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27340 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27342 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27343 MachineBasicBlock::iterator MIItEnd =
27344 std::next(MachineBasicBlock::iterator(LastCMOV));
27345 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27347 // Now remove the CMOV(s).
27348 ThisMBB->erase(MIItBegin, MIItEnd);
27350 return SinkMBB;
27351 }
27353 MachineBasicBlock *
27354 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27355 MachineBasicBlock *BB) const {
27356 // Combine the following atomic floating-point modification pattern:
27357 // a.store(reg OP a.load(acquire), release)
27358 // Transform them into:
27359 // OPss (%gpr), %xmm
27360 // movss %xmm, (%gpr)
27361 // Or sd equivalent for 64-bit operations.
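// For example (a sketch; exact IR shapes and registers vary), the 32-bit
// pattern
//   %old = load atomic float, float* %p acquire
//   %new = fadd float %old, %v
//   store atomic float %new, float* %p release
// is emitted as
//   addss (%gpr), %xmm      # ADDSSrm
//   movss %xmm, (%gpr)      # MOVSSmr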
27362 unsigned FOp, MOp;
27363 switch (MI.getOpcode()) {
27364 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
27365 case X86::RELEASE_FADD32mr:
27366 FOp = X86::ADDSSrm;
27367 MOp = X86::MOVSSmr;
27368 break;
27369 case X86::RELEASE_FADD64mr:
27370 FOp = X86::ADDSDrm;
27371 MOp = X86::MOVSDmr;
27372 break;
27373 }
27374 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27375 DebugLoc DL = MI.getDebugLoc();
27376 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27377 unsigned ValOpIdx = X86::AddrNumOperands;
27378 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27379 MachineInstrBuilder MIB =
27380 BuildMI(*BB, MI, DL, TII->get(FOp),
27381 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27382 .addReg(VSrc);
27383 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27384 MachineOperand &Operand = MI.getOperand(i);
27385 // Clear any kill flags on register operands as we'll create a second
27386 // instruction using the same address operands.
27387 if (Operand.isReg())
27388 Operand.setIsKill(false);
27389 MIB.add(Operand);
27390 }
27391 MachineInstr *FOpMI = MIB;
27392 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27393 for (int i = 0; i < X86::AddrNumOperands; ++i)
27394 MIB.add(MI.getOperand(i));
27395 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27396 MI.eraseFromParent(); // The pseudo instruction is gone now.
27398 return BB;
27399 }
27400 MachineBasicBlock *
27401 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27402 MachineBasicBlock *BB) const {
27403 MachineFunction *MF = BB->getParent();
27404 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27405 DebugLoc DL = MI.getDebugLoc();
27406 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27408 assert(MF->shouldSplitStack());
27410 const bool Is64Bit = Subtarget.is64Bit();
27411 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27413 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27414 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
27416 // BB:
27417 // ... [Till the alloca]
27418 // If stacklet is not large enough, jump to mallocMBB
27420 // bumpMBB:
27421 // Allocate by subtracting from RSP
27422 // Jump to continueMBB
27424 // mallocMBB:
27425 // Allocate by call to runtime
27427 // continueMBB:
27428 // ...
27429 // [rest of original BB]
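// The stacklet-limit check emitted below amounts to (LP64 sketch):
//   movq %rsp, %tmp
//   subq %size, %tmp          # candidate new SP
//   cmpq %tmp, %fs:0x70       # TLS stack limit vs. new SP
//   jg   mallocMBB            # limit above new SP: stacklet too small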
27432 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27433 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27434 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27436 MachineRegisterInfo &MRI = MF->getRegInfo();
27437 const TargetRegisterClass *AddrRegClass =
27438 getRegClassFor(getPointerTy(MF->getDataLayout()));
27440 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27441 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27442 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27443 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27444 sizeVReg = MI.getOperand(1).getReg(),
27445 physSPReg =
27446 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27448 MachineFunction::iterator MBBIter = ++BB->getIterator();
27450 MF->insert(MBBIter, bumpMBB);
27451 MF->insert(MBBIter, mallocMBB);
27452 MF->insert(MBBIter, continueMBB);
27454 continueMBB->splice(continueMBB->begin(), BB,
27455 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27456 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27458 // Add code to the main basic block to check if the stack limit has been hit,
27459 // and if so, jump to mallocMBB otherwise to bumpMBB.
27460 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27461 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27462 .addReg(tmpSPVReg).addReg(sizeVReg);
27463 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27464 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27465 .addReg(SPLimitVReg);
27466 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27468 // bumpMBB simply decreases the stack pointer, since we know the current
27469 // stacklet has enough space.
27470 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27471 .addReg(SPLimitVReg);
27472 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27473 .addReg(SPLimitVReg);
27474 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27476 // Calls into a routine in libgcc to allocate more space from the heap.
27477 const uint32_t *RegMask =
27478 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
27479 if (IsLP64) {
27480 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
27481 .addReg(sizeVReg);
27482 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27483 .addExternalSymbol("__morestack_allocate_stack_space")
27484 .addRegMask(RegMask)
27485 .addReg(X86::RDI, RegState::Implicit)
27486 .addReg(X86::RAX, RegState::ImplicitDefine);
27487 } else if (Is64Bit) {
27488 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
27489 .addReg(sizeVReg);
27490 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27491 .addExternalSymbol("__morestack_allocate_stack_space")
27492 .addRegMask(RegMask)
27493 .addReg(X86::EDI, RegState::Implicit)
27494 .addReg(X86::EAX, RegState::ImplicitDefine);
27495 } else {
27496 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27497 .addImm(16);
27498 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27499 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27500 .addExternalSymbol("__morestack_allocate_stack_space")
27501 .addRegMask(RegMask)
27502 .addReg(X86::EAX, RegState::ImplicitDefine);
27503 }
27505 if (!Is64Bit)
27506 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27507 .addImm(16);
27509 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27510 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27511 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27513 // Set up the CFG correctly.
27514 BB->addSuccessor(bumpMBB);
27515 BB->addSuccessor(mallocMBB);
27516 mallocMBB->addSuccessor(continueMBB);
27517 bumpMBB->addSuccessor(continueMBB);
27519 // Take care of the PHI nodes.
27520 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27521 MI.getOperand(0).getReg())
27522 .addReg(mallocPtrVReg)
27523 .addMBB(mallocMBB)
27524 .addReg(bumpSPPtrVReg)
27525 .addMBB(bumpMBB);
27527 // Delete the original pseudo instruction.
27528 MI.eraseFromParent();
27531 return continueMBB;
27532 }
27534 MachineBasicBlock *
27535 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27536 MachineBasicBlock *BB) const {
27537 MachineFunction *MF = BB->getParent();
27538 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27539 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27540 DebugLoc DL = MI.getDebugLoc();
27542 assert(!isAsynchronousEHPersonality(
27543 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27544 "SEH does not use catchret!");
27546 // Only 32-bit EH needs to worry about manually restoring stack pointers.
27547 if (!Subtarget.is32Bit())
27548 return BB;
27550 // C++ EH creates a new target block to hold the restore code, and wires up
27551 // the new block to the return destination with a normal JMP_4.
27552 MachineBasicBlock *RestoreMBB =
27553 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27554 assert(BB->succ_size() == 1);
27555 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27556 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27557 BB->addSuccessor(RestoreMBB);
27558 MI.getOperand(0).setMBB(RestoreMBB);
27560 auto RestoreMBBI = RestoreMBB->begin();
27561 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27562 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27563 return RestoreMBB;
27564 }
27566 MachineBasicBlock *
27567 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27568 MachineBasicBlock *BB) const {
27569 MachineFunction *MF = BB->getParent();
27570 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27571 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27572 // Only 32-bit SEH requires special handling for catchpad.
27573 if (IsSEH && Subtarget.is32Bit()) {
27574 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27575 DebugLoc DL = MI.getDebugLoc();
27576 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27577 }
27578 MI.eraseFromParent();
27580 return BB;
27581 }
27582 MachineBasicBlock *
27583 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27584 MachineBasicBlock *BB) const {
27585 // So, here we replace TLSADDR with the sequence:
27586 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27587 // We need this because TLSADDR is lowered into calls
27588 // inside MC, therefore without the two markers shrink-wrapping
27589 // may push the prologue/epilogue past them.
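// A sketch of the resulting MI sequence for the 64-bit case (operand
// details elided):
//   ADJCALLSTACKDOWN64 0, 0, 0
//   TLS_addr64 ...
//   ADJCALLSTACKUP64 0, 0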
27590 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27591 DebugLoc DL = MI.getDebugLoc();
27592 MachineFunction &MF = *BB->getParent();
27594 // Emit CALLSEQ_START right before the instruction.
27595 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27596 MachineInstrBuilder CallseqStart =
27597 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27598 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27600 // Emit CALLSEQ_END right after the instruction.
27601 // We don't call erase from parent because we want to keep the
27602 // original instruction around.
27603 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27604 MachineInstrBuilder CallseqEnd =
27605 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27606 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27608 return BB;
27609 }
27611 MachineBasicBlock *
27612 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27613 MachineBasicBlock *BB) const {
27614 // This is pretty easy. We're taking the value that we received from
27615 // our load from the relocation, sticking it in either RDI (x86-64)
27616 // or EAX and doing an indirect call. The return value will then
27617 // be in the normal return register.
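// On x86-64 Darwin the emitted sequence is roughly:
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)             # result comes back in %rax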
27618 MachineFunction *F = BB->getParent();
27619 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27620 DebugLoc DL = MI.getDebugLoc();
27622 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27623 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27625 // Get a register mask for the lowered call.
27626 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27627 // proper register mask.
27628 const uint32_t *RegMask =
27629 Subtarget.is64Bit() ?
27630 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27631 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27632 if (Subtarget.is64Bit()) {
27633 MachineInstrBuilder MIB =
27634 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27635 .addReg(X86::RIP)
27636 .addImm(1)
27637 .addReg(0)
27638 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27639 MI.getOperand(3).getTargetFlags())
27640 .addReg(0);
27641 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27642 addDirectMem(MIB, X86::RDI);
27643 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27644 } else if (!isPositionIndependent()) {
27645 MachineInstrBuilder MIB =
27646 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27647 .addReg(0)
27648 .addImm(1)
27649 .addReg(0)
27650 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27651 MI.getOperand(3).getTargetFlags())
27652 .addReg(0);
27653 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27654 addDirectMem(MIB, X86::EAX);
27655 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27656 } else {
27657 MachineInstrBuilder MIB =
27658 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27659 .addReg(TII->getGlobalBaseReg(F))
27660 .addImm(1)
27661 .addReg(0)
27662 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27663 MI.getOperand(3).getTargetFlags())
27664 .addReg(0);
27665 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27666 addDirectMem(MIB, X86::EAX);
27667 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27668 }
27670 MI.eraseFromParent(); // The pseudo instruction is gone now.
27671 return BB;
27672 }
27674 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27675 switch (RPOpc) {
27676 case X86::RETPOLINE_CALL32:
27677 return X86::CALLpcrel32;
27678 case X86::RETPOLINE_CALL64:
27679 return X86::CALL64pcrel32;
27680 case X86::RETPOLINE_TCRETURN32:
27681 return X86::TCRETURNdi;
27682 case X86::RETPOLINE_TCRETURN64:
27683 return X86::TCRETURNdi64;
27684 }
27685 llvm_unreachable("not retpoline opcode");
27686 }
27688 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27689 unsigned Reg) {
27690 if (Subtarget.useRetpolineExternalThunk()) {
27691 // When using an external thunk for retpolines, we pick names that match the
27692 // names GCC happens to use as well. This helps simplify the implementation
27693 // of the thunks for kernels where they have no easy ability to create
27694 // aliases and are doing non-trivial configuration of the thunk's body. For
27695 // example, the Linux kernel will do boot-time hot patching of the thunk
27696 // bodies and cannot easily export aliases of these to loaded modules.
27698 // Note that at any point in the future, we may need to change the semantics
27699 // of how we implement retpolines and at that time will likely change the
27700 // name of the called thunk. Essentially, there is no hard guarantee that
27701 // LLVM will generate calls to specific thunks, we merely make a best-effort
27702 // attempt to help out kernels and other systems where duplicating the
27703 // thunks is costly.
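// For reference, a retpoline thunk body has the general shape below (a
// sketch only; the actual bodies are provided by the kernel or the compiler
// runtime, not emitted here):
//   __llvm_retpoline_r11:
//     callq .Lsetup
//   .Lcapture:
//     pause
//     lfence
//     jmp .Lcapture             # trap speculative execution here
//   .Lsetup:
//     movq %r11, (%rsp)         # replace the predicted return address
//     retq                      # actually branches to *%r11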
27704 switch (Reg) {
27705 case X86::EAX:
27706 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27707 return "__x86_indirect_thunk_eax";
27708 case X86::ECX:
27709 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27710 return "__x86_indirect_thunk_ecx";
27711 case X86::EDX:
27712 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27713 return "__x86_indirect_thunk_edx";
27714 case X86::EDI:
27715 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27716 return "__x86_indirect_thunk_edi";
27717 case X86::R11:
27718 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27719 return "__x86_indirect_thunk_r11";
27720 }
27721 llvm_unreachable("unexpected reg for retpoline");
27722 }
27724 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27725 switch (Reg) {
27726 case X86::EAX:
27727 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27728 return "__llvm_retpoline_eax";
27729 case X86::ECX:
27730 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27731 return "__llvm_retpoline_ecx";
27732 case X86::EDX:
27733 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27734 return "__llvm_retpoline_edx";
27735 case X86::EDI:
27736 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27737 return "__llvm_retpoline_edi";
27738 case X86::R11:
27739 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27740 return "__llvm_retpoline_r11";
27741 }
27742 llvm_unreachable("unexpected reg for retpoline");
27743 }
27745 MachineBasicBlock *
27746 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27747 MachineBasicBlock *BB) const {
27748 // Copy the virtual register into the R11 physical register and
27749 // call the retpoline thunk.
27750 DebugLoc DL = MI.getDebugLoc();
27751 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27752 unsigned CalleeVReg = MI.getOperand(0).getReg();
27753 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27755 // Find an available scratch register to hold the callee. On 64-bit, we can
27756 // just use R11, but we scan for uses anyway to ensure we don't generate
27757 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27758 // already a register use operand to the call to hold the callee. If none
27759 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27760 // register and ESI is the base pointer to realigned stack frames with VLAs.
27761 SmallVector<unsigned, 3> AvailableRegs;
27762 if (Subtarget.is64Bit())
27763 AvailableRegs.push_back(X86::R11);
27764 else
27765 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27767 // Zero out any registers that are already used.
27768 for (const auto &MO : MI.operands()) {
27769 if (MO.isReg() && MO.isUse())
27770 for (unsigned &Reg : AvailableRegs)
27771 if (Reg == MO.getReg())
27772 Reg = 0;
27773 }
27775 // Choose the first remaining non-zero available register.
27776 unsigned AvailableReg = 0;
27777 for (unsigned MaybeReg : AvailableRegs) {
27778 if (MaybeReg) {
27779 AvailableReg = MaybeReg;
27780 break;
27781 }
27782 }
27783 if (!AvailableReg)
27784 report_fatal_error("calling convention incompatible with retpoline, no "
27785 "available registers");
27787 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27789 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27790 .addReg(CalleeVReg);
27791 MI.getOperand(0).ChangeToES(Symbol);
27792 MI.setDesc(TII->get(Opc));
27793 MachineInstrBuilder(*BB->getParent(), &MI)
27794 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27796 return BB;
27797 }
27798 /// SetJmp implies future control flow change upon calling the corresponding
27799 /// function.
27800 /// Instead of using the 'return' instruction, the long jump fixes the stack and
27801 /// performs an indirect branch. To do so it uses the registers that were stored
27802 /// in the jump buffer (when calling SetJmp).
27803 /// In case the shadow stack is enabled we need to fix it as well, because some
27804 /// return addresses will be skipped.
27805 /// The function will save the SSP for future fixing in the function
27806 /// emitLongJmpShadowStackFix.
27807 /// \sa emitLongJmpShadowStackFix
27808 /// \param [in] MI The temporary Machine Instruction for the builtin.
27809 /// \param [in] MBB The Machine Basic Block that will be modified.
27810 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
27811 MachineBasicBlock *MBB) const {
27812 DebugLoc DL = MI.getDebugLoc();
27813 MachineFunction *MF = MBB->getParent();
27814 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27815 MachineRegisterInfo &MRI = MF->getRegInfo();
27816 MachineInstrBuilder MIB;
27818 // Memory Reference.
27819 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27820 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27822 // Initialize a register with zero.
27823 MVT PVT = getPointerTy(MF->getDataLayout());
27824 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27825 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27826 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27827 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
27829 .addReg(ZReg, RegState::Undef)
27830 .addReg(ZReg, RegState::Undef);
27832 // Read the current SSP Register value to the zeroed register.
27833 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27834 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27835 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27837 // Write the SSP register value to offset 3 in input memory buffer.
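// (Pointer-sized slot layout assumed by this lowering, matching the loads
// in emitEHSjLjLongJmp: slot 0 = frame pointer, slot 1 = resume label,
// slot 2 = stack pointer, slot 3 = shadow stack pointer.)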
27838 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27839 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
27840 const int64_t SSPOffset = 3 * PVT.getStoreSize();
27841 const unsigned MemOpndSlot = 1;
27842 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27843 if (i == X86::AddrDisp)
27844 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
27845 else
27846 MIB.add(MI.getOperand(MemOpndSlot + i));
27847 }
27848 MIB.addReg(SSPCopyReg);
27849 MIB.setMemRefs(MMOBegin, MMOEnd);
27851 }
27852 MachineBasicBlock *
27853 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27854 MachineBasicBlock *MBB) const {
27855 DebugLoc DL = MI.getDebugLoc();
27856 MachineFunction *MF = MBB->getParent();
27857 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27858 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27859 MachineRegisterInfo &MRI = MF->getRegInfo();
27861 const BasicBlock *BB = MBB->getBasicBlock();
27862 MachineFunction::iterator I = ++MBB->getIterator();
27864 // Memory Reference
27865 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27866 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27869 unsigned MemOpndSlot = 0;
27871 unsigned CurOp = 0;
27873 DstReg = MI.getOperand(CurOp++).getReg();
27874 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27875 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27877 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27878 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27880 MemOpndSlot = CurOp;
27882 MVT PVT = getPointerTy(MF->getDataLayout());
27883 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27884 "Invalid Pointer Size!");
27886 // For v = setjmp(buf), we generate
27888 // thisMBB:
27889 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27890 // SjLjSetup restoreMBB
27892 // mainMBB:
27893 // v_main = 0
27895 // sinkMBB:
27896 // v = phi(main, restore)
27898 // restoreMBB:
27899 // if base pointer being used, load it from frame
27900 // v_restore = 1
27902 MachineBasicBlock *thisMBB = MBB;
27903 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27904 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27905 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27906 MF->insert(I, mainMBB);
27907 MF->insert(I, sinkMBB);
27908 MF->push_back(restoreMBB);
27909 restoreMBB->setHasAddressTaken();
27911 MachineInstrBuilder MIB;
27913 // Transfer the remainder of BB and its successor edges to sinkMBB.
27914 sinkMBB->splice(sinkMBB->begin(), MBB,
27915 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27916 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27919 unsigned PtrStoreOpc = 0;
27920 unsigned LabelReg = 0;
27921 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27922 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27923 !isPositionIndependent();
27925 // Prepare IP either in reg or imm.
27926 if (!UseImmLabel) {
27927 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27928 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27929 LabelReg = MRI.createVirtualRegister(PtrRC);
27930 if (Subtarget.is64Bit()) {
27931 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27932 .addReg(X86::RIP)
27933 .addImm(1)
27934 .addReg(0)
27935 .addMBB(restoreMBB)
27936 .addReg(0);
27937 } else {
27938 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27939 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27940 .addReg(XII->getGlobalBaseReg(MF))
27941 .addImm(1)
27942 .addReg(0)
27943 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27944 .addReg(0);
27945 }
27946 } else {
27947 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27948 }
27949 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27950 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27951 if (i == X86::AddrDisp)
27952 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27953 else
27954 MIB.add(MI.getOperand(MemOpndSlot + i));
27955 }
27956 if (!UseImmLabel)
27957 MIB.addReg(LabelReg);
27958 else
27959 MIB.addMBB(restoreMBB);
27960 MIB.setMemRefs(MMOBegin, MMOEnd);
27962 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
27963 emitSetJmpShadowStackFix(MI, thisMBB);
27964 }
27967 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27968 .addMBB(restoreMBB);
27970 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27971 MIB.addRegMask(RegInfo->getNoPreservedMask());
27972 thisMBB->addSuccessor(mainMBB);
27973 thisMBB->addSuccessor(restoreMBB);
27977 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27978 mainMBB->addSuccessor(sinkMBB);
27981 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27982 TII->get(X86::PHI), DstReg)
27983 .addReg(mainDstReg).addMBB(mainMBB)
27984 .addReg(restoreDstReg).addMBB(restoreMBB);
27987 if (RegInfo->hasBasePointer(*MF)) {
27988 const bool Uses64BitFramePtr =
27989 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27990 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27991 X86FI->setRestoreBasePointer(MF);
27992 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27993 unsigned BasePtr = RegInfo->getBaseRegister();
27994 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27995 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27996 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27997 .setMIFlag(MachineInstr::FrameSetup);
27998 }
27999 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
28000 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
28001 restoreMBB->addSuccessor(sinkMBB);
28003 MI.eraseFromParent();
28005 return sinkMBB;
28006 }
28007 /// Fix the shadow stack using the previously saved SSP pointer.
28008 /// \sa emitSetJmpShadowStackFix
28009 /// \param [in] MI The temporary Machine Instruction for the builtin.
28010 /// \param [in] MBB The Machine Basic Block that will be modified.
28011 /// \return The sink MBB that will perform the future indirect branch.
28012 MachineBasicBlock *
28013 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
28014 MachineBasicBlock *MBB) const {
28015 DebugLoc DL = MI.getDebugLoc();
28016 MachineFunction *MF = MBB->getParent();
28017 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28018 MachineRegisterInfo &MRI = MF->getRegInfo();
28020 // Memory Reference
28021 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28022 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28024 MVT PVT = getPointerTy(MF->getDataLayout());
28025 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
28027 // checkSspMBB:
28028 // xor vreg1, vreg1
28029 // rdssp vreg1
28030 // test vreg1, vreg1
28031 // je sinkMBB # Jump if Shadow Stack is not supported
28032 // fallMBB:
28033 // mov buf+24/12(%rip), vreg2
28034 // sub vreg1, vreg2
28035 // jbe sinkMBB # No need to fix the Shadow Stack
28036 // fixShadowMBB:
28037 // shr 3/2, vreg2
28038 // incssp vreg2 # fix the SSP according to the lower 8 bits
28039 // shr 8, vreg2
28040 // je sinkMBB
28041 // fixShadowLoopPrepareMBB:
28042 // shl vreg2
28043 // mov 128, vreg3
28044 // fixShadowLoopMBB:
28045 // incssp vreg3
28046 // dec vreg2
28047 // jne fixShadowLoopMBB # Iterate until you finish fixing
28048 // # the Shadow Stack
28049 // sinkMBB:
28051 MachineFunction::iterator I = ++MBB->getIterator();
28052 const BasicBlock *BB = MBB->getBasicBlock();
28054 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
28055 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
28056 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
28057 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
28058 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
28059 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
28060 MF->insert(I, checkSspMBB);
28061 MF->insert(I, fallMBB);
28062 MF->insert(I, fixShadowMBB);
28063 MF->insert(I, fixShadowLoopPrepareMBB);
28064 MF->insert(I, fixShadowLoopMBB);
28065 MF->insert(I, sinkMBB);
28067 // Transfer the remainder of BB and its successor edges to sinkMBB.
28068 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
28069 MBB->end());
28070 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
28072 MBB->addSuccessor(checkSspMBB);
28074 // Initialize a register with zero.
28075 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
28076 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
28077 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
28079 .addReg(ZReg, RegState::Undef)
28080 .addReg(ZReg, RegState::Undef);
28082 // Read the current SSP Register value to the zeroed register.
28083 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
28084 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
28085 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
28087 // Check whether the result of the SSP register is zero and jump directly
28088 // to the sink.
28089 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
28090 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
28091 .addReg(SSPCopyReg)
28092 .addReg(SSPCopyReg);
28093 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28094 checkSspMBB->addSuccessor(sinkMBB);
28095 checkSspMBB->addSuccessor(fallMBB);
28097 // Reload the previously saved SSP register value.
28098 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
28099 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28100 const int64_t SPPOffset = 3 * PVT.getStoreSize();
28101 MachineInstrBuilder MIB =
28102 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
28103 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28104 if (i == X86::AddrDisp)
28105 MIB.addDisp(MI.getOperand(i), SPPOffset);
28106 else
28107 MIB.add(MI.getOperand(i));
28108 }
28109 MIB.setMemRefs(MMOBegin, MMOEnd);
28111 // Subtract the current SSP from the previous SSP.
28112 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
28113 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
28114 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
28115 .addReg(PrevSSPReg)
28116 .addReg(SSPCopyReg);
28118 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
28119 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
28120 fallMBB->addSuccessor(sinkMBB);
28121 fallMBB->addSuccessor(fixShadowMBB);
28123 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
28124 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
28125 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
28126 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
28127 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
28128 .addReg(SspSubReg)
28129 .addImm(Offset);
28131 // Increase the SSP, using only the lower 8 bits of the delta.
28132 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
28133 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
28135 // Reset the lower 8 bits.
28136 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
28137 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
28138 .addReg(SspFirstShrReg)
28139 .addImm(8);
28141 // Jump if the result of the shift is zero.
28142 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28143 fixShadowMBB->addSuccessor(sinkMBB);
28144 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
28146 // Do a single shift left.
28147 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
28148 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
28149 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
28150 .addReg(SspSecondShrReg);
28152 // Save the value 128 to a register (will be used next with incssp).
28153 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
28154 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
28155 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
28156 .addImm(128);
28157 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
28159 // Since incssp only looks at the lower 8 bits, we might need to do several
28160 // iterations of incssp until we finish fixing the shadow stack.
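// Worked example (x86-64): a byte delta of 0x1230 is 0x246 slots after the
// shift by 3. The first INCSSP consumes the low 8 bits (0x46 slots); the
// remaining 0x200 slots become a loop counter of 4 (0x2 << 1), i.e. four
// INCSSP-by-128 iterations below.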
28161 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
28162 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
28163 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
28164 .addReg(SspAfterShlReg)
28165 .addMBB(fixShadowLoopPrepareMBB)
28166 .addReg(DecReg)
28167 .addMBB(fixShadowLoopMBB);
28169 // Every iteration we increase the SSP by 128.
28170 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
28172 // Every iteration we decrement the counter by 1.
28173 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
28174 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
28176 // Jump if the counter is not zero yet.
28177 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
28178 fixShadowLoopMBB->addSuccessor(sinkMBB);
28179 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
28181 return sinkMBB;
28182 }
28184 MachineBasicBlock *
28185 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
28186 MachineBasicBlock *MBB) const {
28187 DebugLoc DL = MI.getDebugLoc();
28188 MachineFunction *MF = MBB->getParent();
28189 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28190 MachineRegisterInfo &MRI = MF->getRegInfo();
28192 // Memory Reference
28193 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28194 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28196 MVT PVT = getPointerTy(MF->getDataLayout());
28197 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
28198 "Invalid Pointer Size!");
28200 const TargetRegisterClass *RC =
28201 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28202 unsigned Tmp = MRI.createVirtualRegister(RC);
28203 // Since FP is only updated here but NOT referenced, it's treated as GPR.
28204 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28205 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
28206 unsigned SP = RegInfo->getStackRegister();
28208 MachineInstrBuilder MIB;
28210 const int64_t LabelOffset = 1 * PVT.getStoreSize();
28211 const int64_t SPOffset = 2 * PVT.getStoreSize();
28213 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28214 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
28216 MachineBasicBlock *thisMBB = MBB;
28218 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
28219 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
28220 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
28221 }
28223 // Reload FP.
28224 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
28225 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
28226 MIB.add(MI.getOperand(i));
28227 MIB.setMemRefs(MMOBegin, MMOEnd);
28229 // Reload IP.
28230 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
28231 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28232 if (i == X86::AddrDisp)
28233 MIB.addDisp(MI.getOperand(i), LabelOffset);
28234 else
28235 MIB.add(MI.getOperand(i));
28236 }
28237 MIB.setMemRefs(MMOBegin, MMOEnd);
28239 // Reload SP.
28240 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
28241 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28242 if (i == X86::AddrDisp)
28243 MIB.addDisp(MI.getOperand(i), SPOffset);
28244 else
28245 MIB.add(MI.getOperand(i));
28246 }
28247 MIB.setMemRefs(MMOBegin, MMOEnd);
28249 // Jump.
28250 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
28252 MI.eraseFromParent();
28253 return thisMBB;
28254 }
28256 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
28257 MachineBasicBlock *MBB,
28258 MachineBasicBlock *DispatchBB,
28259 int FI) const {
28260 DebugLoc DL = MI.getDebugLoc();
28261 MachineFunction *MF = MBB->getParent();
28262 MachineRegisterInfo *MRI = &MF->getRegInfo();
28263 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28265 MVT PVT = getPointerTy(MF->getDataLayout());
28266 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
28268 unsigned Op = 0;
28269 unsigned VR = 0;
28271 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
28272 !isPositionIndependent();
28274 if (UseImmLabel) {
28275 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
28276 } else {
28277 const TargetRegisterClass *TRC =
28278 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28279 VR = MRI->createVirtualRegister(TRC);
28280 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
28282 if (Subtarget.is64Bit())
28283 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
28284 .addReg(X86::RIP)
28285 .addImm(1)
28286 .addReg(0)
28287 .addMBB(DispatchBB)
28288 .addReg(0);
28289 else
28290 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
28291 .addReg(0) /* TII->getGlobalBaseReg(MF) */
28292 .addImm(1)
28293 .addReg(0)
28294 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
28295 .addReg(0);
28296 }
28298 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
28299 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
28300 if (UseImmLabel)
28301 MIB.addMBB(DispatchBB);
28302 else
28303 MIB.addReg(VR);
28304 }
28306 MachineBasicBlock *
28307 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
28308 MachineBasicBlock *BB) const {
28309 DebugLoc DL = MI.getDebugLoc();
28310 MachineFunction *MF = BB->getParent();
28311 MachineFrameInfo &MFI = MF->getFrameInfo();
28312 MachineRegisterInfo *MRI = &MF->getRegInfo();
28313 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28314 int FI = MFI.getFunctionContextIndex();
28316 // Get a mapping of the call site numbers to all of the landing pads they're
28317 // associated with.
28318 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
28319 unsigned MaxCSNum = 0;
28320 for (auto &MBB : *MF) {
28321 if (!MBB.isEHPad())
28322 continue;
28324 MCSymbol *Sym = nullptr;
28325 for (const auto &MI : MBB) {
28326 if (MI.isDebugInstr())
28327 continue;
28329 assert(MI.isEHLabel() && "expected EH_LABEL");
28330 Sym = MI.getOperand(0).getMCSymbol();
28331 break;
28332 }
28334 if (!MF->hasCallSiteLandingPad(Sym))
28335 continue;
28337 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
28338 CallSiteNumToLPad[CSI].push_back(&MBB);
28339 MaxCSNum = std::max(MaxCSNum, CSI);
28343 // Get an ordered list of the machine basic blocks for the jump table.
28344 std::vector<MachineBasicBlock *> LPadList;
28345 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
28346 LPadList.reserve(CallSiteNumToLPad.size());
28348 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
28349 for (auto &LP : CallSiteNumToLPad[CSI]) {
28350 LPadList.push_back(LP);
28351 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
28355 assert(!LPadList.empty() &&
28356 "No landing pad destinations for the dispatch jump table!");
28358 // Create the MBBs for the dispatch code.
28360 // Shove the dispatch's address into the return slot in the function context.
28361 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
28362 DispatchBB->setIsEHPad(true);
28364 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
28365 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
28366 DispatchBB->addSuccessor(TrapBB);
28368 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
28369 DispatchBB->addSuccessor(DispContBB);
28372 MF->push_back(DispatchBB);
28373 MF->push_back(DispContBB);
28374 MF->push_back(TrapBB);
28376 // Insert code into the entry block that creates and registers the function
28377 // context.
28378 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
28380 // Create the jump table and associated information
28381 unsigned JTE = getJumpTableEncoding();
28382 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
28383 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
28385 const X86RegisterInfo &RI = TII->getRegisterInfo();
28386 // Add a register mask with no preserved registers. This results in all
28387 // registers being marked as clobbered.
28388 if (RI.hasBasePointer(*MF)) {
28389 const bool FPIs64Bit =
28390 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
28391 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
28392 MFI->setRestoreBasePointer(MF);
28394 unsigned FP = RI.getFrameRegister(*MF);
28395 unsigned BP = RI.getBaseRegister();
28396 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
28397 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
28398 MFI->getRestoreBasePointerOffset())
28399 .addRegMask(RI.getNoPreservedMask());
28401 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
28402 .addRegMask(RI.getNoPreservedMask());
28405 // IReg is used as an index in a memory operand and therefore can't be SP
28406 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
28407 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
28408 Subtarget.is64Bit() ? 8 : 4);
28409 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
28410 .addReg(IReg)
28411 .addImm(LPadList.size());
28412 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
28414 if (Subtarget.is64Bit()) {
28415 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28416 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
28418 // leaq .LJTI0_0(%rip), BReg
28419 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
28420 .addReg(X86::RIP)
28421 .addImm(1)
28422 .addReg(0)
28423 .addJumpTableIndex(MJTI)
28424 .addReg(0);
28425 // movzx IReg64, IReg
28426 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
28427 .addImm(0)
28428 .addReg(IReg)
28429 .addImm(X86::sub_32bit);
28431 switch (JTE) {
28432 case MachineJumpTableInfo::EK_BlockAddress:
28433 // jmpq *(BReg,IReg64,8)
28434 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
28435 .addReg(BReg)
28436 .addImm(8)
28437 .addReg(IReg64)
28438 .addImm(0)
28439 .addReg(0);
28440 break;
28441 case MachineJumpTableInfo::EK_LabelDifference32: {
28442 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
28443 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
28444 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28446 // movl (BReg,IReg64,4), OReg
28447 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
28448 .addReg(BReg)
28449 .addImm(4)
28450 .addReg(IReg64)
28451 .addImm(0)
28452 .addReg(0);
28453 // movsx OReg64, OReg
28454 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
28455 // addq BReg, OReg64, TReg
28456 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
28457 .addReg(OReg64)
28458 .addReg(BReg);
28459 // jmpq *TReg
28460 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
28461 break;
28462 }
28463 default:
28464 llvm_unreachable("Unexpected jump table encoding");
28465 }
28466 } else {
28467 // jmpl *.LJTI0_0(,IReg,4)
28468 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
28469 .addReg(0)
28470 .addImm(4)
28471 .addReg(IReg)
28472 .addJumpTableIndex(MJTI)
28473 .addReg(0);
28474 }
28476 // Add the jump table entries as successors to the MBB.
28477 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
28478 for (auto &LP : LPadList)
28479 if (SeenMBBs.insert(LP).second)
28480 DispContBB->addSuccessor(LP);
28482 // N.B. the order the invoke BBs are processed in doesn't matter here.
28483 SmallVector<MachineBasicBlock *, 64> MBBLPads;
28484 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
28485 for (MachineBasicBlock *MBB : InvokeBBs) {
28486 // Remove the landing pad successor from the invoke block and replace it
28487 // with the new dispatch block.
28488 // Keep a copy of Successors since it's modified inside the loop.
28489 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
28490 MBB->succ_rend());
28491 // FIXME: Avoid quadratic complexity.
28492 for (auto MBBS : Successors) {
28493 if (MBBS->isEHPad()) {
28494 MBB->removeSuccessor(MBBS);
28495 MBBLPads.push_back(MBBS);
28496 }
28497 }
28499 MBB->addSuccessor(DispatchBB);
28501 // Find the invoke call and mark all of the callee-saved registers as
28502 // 'implicit defined' so that they're spilled. This prevents code from
28503 // moving instructions to before the EH block, where they will never be
28505 for (auto &II : reverse(*MBB)) {
28506 if (!II.isCall())
28507 continue;
28509 DenseMap<unsigned, bool> DefRegs;
28510 for (auto &MOp : II.operands())
28511 if (MOp.isReg())
28512 DefRegs[MOp.getReg()] = true;
28514 MachineInstrBuilder MIB(*MF, &II);
28515 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
28516 unsigned Reg = SavedRegs[RI];
28518 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
28519 }
28521 break;
28522 }
28523 }
28525 // Mark all former landing pads as non-landing pads. The dispatch is the only
28526 // landing pad now.
28527 for (auto &LP : MBBLPads)
28528 LP->setIsEHPad(false);
28530 // The instruction is gone now.
28531 MI.eraseFromParent();
28533 return DispContBB;
28534 }
28535 MachineBasicBlock *
28536 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28537 MachineBasicBlock *BB) const {
28538 MachineFunction *MF = BB->getParent();
28539 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28540 DebugLoc DL = MI.getDebugLoc();
28542 switch (MI.getOpcode()) {
28543 default: llvm_unreachable("Unexpected instr type to insert");
28544 case X86::TLS_addr32:
28545 case X86::TLS_addr64:
28546 case X86::TLS_base_addr32:
28547 case X86::TLS_base_addr64:
28548 return EmitLoweredTLSAddr(MI, BB);
28549 case X86::RETPOLINE_CALL32:
28550 case X86::RETPOLINE_CALL64:
28551 case X86::RETPOLINE_TCRETURN32:
28552 case X86::RETPOLINE_TCRETURN64:
28553 return EmitLoweredRetpoline(MI, BB);
28554 case X86::CATCHRET:
28555 return EmitLoweredCatchRet(MI, BB);
28556 case X86::CATCHPAD:
28557 return EmitLoweredCatchPad(MI, BB);
28558 case X86::SEG_ALLOCA_32:
28559 case X86::SEG_ALLOCA_64:
28560 return EmitLoweredSegAlloca(MI, BB);
28561 case X86::TLSCall_32:
28562 case X86::TLSCall_64:
28563 return EmitLoweredTLSCall(MI, BB);
28564 case X86::CMOV_FR32:
28565 case X86::CMOV_FR64:
28566 case X86::CMOV_F128:
28567 case X86::CMOV_GR8:
28568 case X86::CMOV_GR16:
28569 case X86::CMOV_GR32:
28570 case X86::CMOV_RFP32:
28571 case X86::CMOV_RFP64:
28572 case X86::CMOV_RFP80:
28573 case X86::CMOV_V2F64:
28574 case X86::CMOV_V2I64:
28575 case X86::CMOV_V4F32:
28576 case X86::CMOV_V4F64:
28577 case X86::CMOV_V4I64:
28578 case X86::CMOV_V16F32:
28579 case X86::CMOV_V8F32:
28580 case X86::CMOV_V8F64:
28581 case X86::CMOV_V8I64:
28582 case X86::CMOV_V8I1:
28583 case X86::CMOV_V16I1:
28584 case X86::CMOV_V32I1:
28585 case X86::CMOV_V64I1:
28586 return EmitLoweredSelect(MI, BB);
28588 case X86::RDFLAGS32:
28589 case X86::RDFLAGS64: {
28590 unsigned PushF =
28591 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28592 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28593 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28594 // Permit reads of the EFLAGS and DF registers without them being defined.
28595 // This intrinsic exists to read external processor state in flags, such as
28596 // the trap flag, interrupt flag, and direction flag, none of which are
28597 // modeled by the backend.
28598 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
28599 "Unexpected register in operand!");
28600 Push->getOperand(2).setIsUndef();
28601 assert(Push->getOperand(3).getReg() == X86::DF &&
28602 "Unexpected register in operand!");
28603 Push->getOperand(3).setIsUndef();
28604 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28606 MI.eraseFromParent(); // The pseudo is gone now.
28608 return BB;
28609 }
28610 case X86::WRFLAGS32:
28611 case X86::WRFLAGS64: {
28612 unsigned Push =
28613 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28614 unsigned PopF =
28615 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28616 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28617 BuildMI(*BB, MI, DL, TII->get(PopF));
28619 MI.eraseFromParent(); // The pseudo is gone now.
28621 return BB;
28622 }
28623 case X86::RELEASE_FADD32mr:
28624 case X86::RELEASE_FADD64mr:
28625 return EmitLoweredAtomicFP(MI, BB);
28627 case X86::FP32_TO_INT16_IN_MEM:
28628 case X86::FP32_TO_INT32_IN_MEM:
28629 case X86::FP32_TO_INT64_IN_MEM:
28630 case X86::FP64_TO_INT16_IN_MEM:
28631 case X86::FP64_TO_INT32_IN_MEM:
28632 case X86::FP64_TO_INT64_IN_MEM:
28633 case X86::FP80_TO_INT16_IN_MEM:
28634 case X86::FP80_TO_INT32_IN_MEM:
28635 case X86::FP80_TO_INT64_IN_MEM: {
28636 // Change the floating point control register to use "round towards zero"
28637 // mode when truncating to an integer value.
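// In effect (32-bit sketch for the 80-bit source case):
//   fnstcw (%slot)           # save the current control word
//   movw   (%slot), %oldcw   # remember it in a vreg
//   movw   $0xC7F, (%slot)   # rounding control = round toward zero
//   fldcw  (%slot)           # activate truncating mode
//   movw   %oldcw, (%slot)   # put the old value back in memory
//   fistpl (%addr)           # the store itself (IST_Fp32m80)
//   fldcw  (%slot)           # restore the original mode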
28638 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28639 addFrameReference(BuildMI(*BB, MI, DL,
28640 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28642 // Load the old value of the high byte of the control word...
28643 unsigned OldCW =
28644 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28645 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28648 // Set the high part to be round to zero...
28649 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28650 .addImm(0xC7F);
28652 // Reload the modified control word now...
28653 addFrameReference(BuildMI(*BB, MI, DL,
28654 TII->get(X86::FLDCW16m)), CWFrameIdx);
28656 // Restore the memory image of control word to original value
28657 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28658 .addReg(OldCW);
28660 // Get the X86 opcode to use.
28661 unsigned Opc;
28662 switch (MI.getOpcode()) {
28663 default: llvm_unreachable("illegal opcode!");
28664 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28665 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28666 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28667 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28668 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28669 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28670 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28671 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28672 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
28675 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28676 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28677 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28679 // Reload the original control word now.
28680 addFrameReference(BuildMI(*BB, MI, DL,
28681 TII->get(X86::FLDCW16m)), CWFrameIdx);
28683 MI.eraseFromParent(); // The pseudo instruction is gone now.
28684 return BB;
28685 }
28686 // Thread synchronization.
28687 case X86::MONITOR:
28688 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28689 case X86::MONITORX:
28690 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28693 case X86::CLZERO:
28694 return emitClzero(&MI, BB, Subtarget);
28697 case X86::WRPKRU:
28698 return emitWRPKRU(MI, BB, Subtarget);
28699 case X86::RDPKRU:
28700 return emitRDPKRU(MI, BB, Subtarget);
28702 case X86::XBEGIN:
28703 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28705 case X86::VASTART_SAVE_XMM_REGS:
28706 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28708 case X86::VAARG_64:
28709 return EmitVAARG64WithCustomInserter(MI, BB);
28711 case X86::EH_SjLj_SetJmp32:
28712 case X86::EH_SjLj_SetJmp64:
28713 return emitEHSjLjSetJmp(MI, BB);
28715 case X86::EH_SjLj_LongJmp32:
28716 case X86::EH_SjLj_LongJmp64:
28717 return emitEHSjLjLongJmp(MI, BB);
28719 case X86::Int_eh_sjlj_setup_dispatch:
28720 return EmitSjLjDispatchBlock(MI, BB);
28722 case TargetOpcode::STATEPOINT:
28723 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28724 // this point in the process. We diverge later.
28725 return emitPatchPoint(MI, BB);
28727 case TargetOpcode::STACKMAP:
28728 case TargetOpcode::PATCHPOINT:
28729 return emitPatchPoint(MI, BB);
28731 case TargetOpcode::PATCHABLE_EVENT_CALL:
28732 return emitXRayCustomEvent(MI, BB);
28734 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
28735 return emitXRayTypedEvent(MI, BB);
28737 case X86::LCMPXCHG8B: {
28738 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28739 // In addition to the four E[ABCD] registers implied by its encoding,
28740 // CMPXCHG8B requires a memory operand. If the current target is i686 and
28741 // the current function needs a base pointer - which is ESI on i686 - the
28742 // register allocator cannot allocate registers for an address of the form
28743 // X(%reg, %reg, Y): there would never be enough unreserved registers during
28744 // regalloc (without the base pointer the only option would be
28745 // X(%edi, %esi, Y)).
28746 // We give the register allocator a hand by precomputing the address in a
28747 // new vreg using LEA.
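// I.e., rather than asking regalloc to fold X(%reg,%reg,Y) into the
// CMPXCHG8B itself, we emit, in effect:
//   leal X(%reg,%reg,Y), %computed
//   lock cmpxchg8b (%computed)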
28749 // If it is not i686 or there is no base pointer, there is nothing to do.
28750 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28751 return BB;
28753 // Even though this code does not necessarily need the base pointer to
28754 // be ESI, we check for that. The reason: if this assert fails, some
28755 // changes have happened in the compiler's base pointer handling, which
28756 // most probably have to be addressed somehow here.
28757 assert(TRI->getBaseRegister() == X86::ESI &&
28758 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28759 "base pointer in mind");
28761 MachineRegisterInfo &MRI = MF->getRegInfo();
28762 MVT SPTy = getPointerTy(MF->getDataLayout());
28763 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28764 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28766 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28767 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28768 // does not use index register.
28769 if (AM.IndexReg == X86::NoRegister)
28770 return BB;
28772 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28773 // four operand definitions that are E[ABCD] registers. We skip them and
28774 // then insert the LEA.
28775 MachineBasicBlock::iterator MBBI(MI);
28776 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28777 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28778 ++MBBI;
28779 addFullAddress(
28780 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
28782 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28784 return BB;
28785 }
28786 case X86::LCMPXCHG16B:
28787 return BB;
28788 case X86::LCMPXCHG8B_SAVE_EBX:
28789 case X86::LCMPXCHG16B_SAVE_RBX: {
28790 unsigned BasePtr =
28791 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28792 if (!BB->isLiveIn(BasePtr))
28793 BB->addLiveIn(BasePtr);
28794 return BB;
28795 }
28796 }
28797 }
28799 //===----------------------------------------------------------------------===//
28800 // X86 Optimization Hooks
28801 //===----------------------------------------------------------------------===//
28803 bool
28804 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28805 const APInt &Demanded,
28806 TargetLoweringOpt &TLO) const {
28807 // Only optimize Ands to prevent shrinking a constant that could be
28808 // matched by movzx.
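// Worked example: for (and X, 0x17F) with only bits 0-6 demanded, the
// shrunk mask 0x7F has 7 active bits; rounding up to a byte width gives
// 0xFF, which MOVZX can match, so the AND mask is replaced by 0xFF.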
28809 if (Op.getOpcode() != ISD::AND)
28810 return false;
28812 EVT VT = Op.getValueType();
28814 // Ignore vectors.
28815 if (VT.isVector())
28816 return false;
28818 unsigned Size = VT.getSizeInBits();
28820 // Make sure the RHS really is a constant.
28821 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28822 if (!C)
28823 return false;
28825 const APInt &Mask = C->getAPIntValue();
28827 // Clear all non-demanded bits initially.
28828 APInt ShrunkMask = Mask & Demanded;
28830 // Find the width of the shrunk mask.
28831 unsigned Width = ShrunkMask.getActiveBits();
28833 // If the mask is all 0s there's nothing to do here.
28834 if (Width == 0)
28835 return false;
28837 // Find the next power of 2 width, rounding up to a byte.
28838 Width = PowerOf2Ceil(std::max(Width, 8U));
28839 // Truncate the width to size to handle illegal types.
28840 Width = std::min(Width, Size);
28842 // Calculate a possible zero extend mask for this constant.
28843 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28845 // If we aren't changing the mask, just return true to keep it and prevent
28846 // the caller from optimizing.
28847 if (ZeroExtendMask == Mask)
28848 return true;
28850 // Make sure the new mask can be represented by a combination of mask bits
28851 // and non-demanded bits.
28852 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28853 return false;
28855 // Replace the constant with the zero extend mask.
28856 SDLoc DL(Op);
28857 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28858 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28859 return TLO.CombineTo(Op, NewOp);
28860 }
28862 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28863 KnownBits &Known,
28864 const APInt &DemandedElts,
28865 const SelectionDAG &DAG,
28866 unsigned Depth) const {
28867 unsigned BitWidth = Known.getBitWidth();
28868 unsigned Opc = Op.getOpcode();
28869 EVT VT = Op.getValueType();
28870 assert((Opc >= ISD::BUILTIN_OP_END ||
28871 Opc == ISD::INTRINSIC_WO_CHAIN ||
28872 Opc == ISD::INTRINSIC_W_CHAIN ||
28873 Opc == ISD::INTRINSIC_VOID) &&
28874 "Should use MaskedValueIsZero if you don't know whether Op"
28875 " is a target node!");
28877 Known.resetAll();
28879 switch (Opc) {
28880 case X86ISD::SETCC:
28881 Known.Zero.setBitsFrom(1);
28882 break;
28883 case X86ISD::MOVMSK: {
28884 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28885 Known.Zero.setBitsFrom(NumLoBits);
28886 break;
28887 }
28888 case X86ISD::PEXTRB:
28889 case X86ISD::PEXTRW: {
28890 SDValue Src = Op.getOperand(0);
28891 EVT SrcVT = Src.getValueType();
28892 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28893 Op.getConstantOperandVal(1));
28894 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28895 Known = Known.zextOrTrunc(BitWidth);
28896 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28897 break;
28898 }
28899 case X86ISD::VSHLI:
28900 case X86ISD::VSRLI: {
28901 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28902 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }

      DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28908 unsigned ShAmt = ShiftImm->getZExtValue();
28909 if (Opc == X86ISD::VSHLI) {
28910 Known.Zero <<= ShAmt;
28911 Known.One <<= ShAmt;
28912 // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
        Known.Zero.lshrInPlace(ShAmt);
28916 Known.One.lshrInPlace(ShAmt);
28917 // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
  case X86ISD::PACKUS: {
28924 // PACKUS is just a truncation if the upper half is zero.
    // TODO: Add DemandedElts support.
    KnownBits Known2;
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
    DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
28929 Known.One &= Known2.One;
28930 Known.Zero &= Known2.Zero;
    if (Known.countMinLeadingZeros() < BitWidth)
      Known.resetAll();
    Known = Known.trunc(BitWidth);
    break;
  }
  case X86ISD::VZEXT: {
28937 // TODO: Add DemandedElts support.
28938 SDValue N0 = Op.getOperand(0);
28939 unsigned NumElts = VT.getVectorNumElements();
28941 EVT SrcVT = N0.getValueType();
28942 unsigned InNumElts = SrcVT.getVectorNumElements();
28943 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28944 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28946 Known = KnownBits(InBitWidth);
28947 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28948 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28949 Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
  case X86ISD::CMOV: {
28954 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28955 // If we don't know any bits, early out.
    if (Known.isUnknown())
      break;
    KnownBits Known2;
    DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28961 // Only known if known in both the LHS and RHS.
28962 Known.One &= Known2.One;
    Known.Zero &= Known2.Zero;
    break;
  }
  case X86ISD::UDIVREM8_ZEXT_HREG:
28967 // TODO: Support more than just the zero extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is zero extended.
    Known.Zero.setBitsFrom(8);
    break;
  }
28975 // Handle target shuffles.
28976 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
  if (isTargetShuffle(Opc)) {
    bool IsUnary;
    SmallVector<int, 64> Mask;
    SmallVector<SDValue, 2> Ops;
    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
                             IsUnary)) {
      unsigned NumOps = Ops.size();
28984 unsigned NumElts = VT.getVectorNumElements();
28985 if (Mask.size() == NumElts) {
28986 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
28987 Known.Zero.setAllBits(); Known.One.setAllBits();
28988 for (unsigned i = 0; i != NumElts; ++i) {
          if (!DemandedElts[i])
            continue;
          int M = Mask[i];
          if (M == SM_SentinelUndef) {
            // For UNDEF elements, we don't know anything about the common state
            // of the shuffle result.
            Known.resetAll();
            break;
          } else if (M == SM_SentinelZero) {
            Known.One.clearAllBits();
            continue;
          }
          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
                 "Shuffle index out of range");

          unsigned OpIdx = (unsigned)M / NumElts;
          unsigned EltIdx = (unsigned)M % NumElts;
          if (Ops[OpIdx].getValueType() != VT) {
            // TODO - handle target shuffle ops with different value types.
            Known.resetAll();
            break;
          }
          DemandedOps[OpIdx].setBit(EltIdx);
        }
29013 // Known bits are the values that are shared by every demanded element.
29014 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
          if (!DemandedOps[i])
            continue;
          KnownBits Known2;
          DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
          Known.One &= Known2.One;
          Known.Zero &= Known2.Zero;
        }
      }
    }
  }
}
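
// Worked example for computeKnownBitsForTargetNode (illustrative, not from
// the original source): for X86ISD::MOVMSK of a v4f32 source, only the low 4
// result bits can ever be set, so Known.Zero covers bits [4, BitWidth).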
29027 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
29028 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
29029 unsigned Depth) const {
29030 unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VSEXT: {
29038 // TODO: Add DemandedElts support.
29039 SDValue Src = Op.getOperand(0);
29040 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

  case X86ISD::VTRUNC: {
29046 // TODO: Add DemandedElts support.
29047 SDValue Src = Op.getOperand(0);
29048 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
29049 assert(VTBits < NumSrcBits && "Illegal truncation input type");
29050 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
29051 if (Tmp > (NumSrcBits - VTBits))
      return Tmp - (NumSrcBits - VTBits);
    return 1;
  }

  case X86ISD::PACKSS: {
29057 // PACKSS is just a truncation if the sign bits extend to the packed size.
29058 // TODO: Add DemandedElts support.
29059 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
29060 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
29061 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
29062 unsigned Tmp = std::min(Tmp0, Tmp1);
29063 if (Tmp > (SrcBits - VTBits))
      return Tmp - (SrcBits - VTBits);
    return 1;
  }

  case X86ISD::VSHLI: {
29069 SDValue Src = Op.getOperand(0);
29070 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
29071 if (ShiftVal.uge(VTBits))
29072 return VTBits; // Shifted all bits out --> zero.
29073 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
29074 if (ShiftVal.uge(Tmp))
29075 return 1; // Shifted all sign bits out --> unknown.
    return Tmp - ShiftVal.getZExtValue();
  }

  case X86ISD::VSRAI: {
29080 SDValue Src = Op.getOperand(0);
29081 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
29082 if (ShiftVal.uge(VTBits - 1))
29083 return VTBits; // Sign splat.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

29089 case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;

29097 case X86ISD::CMOV: {
29098 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
29099 if (Tmp0 == 1) return 1; // Early out.
29100 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
    return std::min(Tmp0, Tmp1);
  }
  case X86ISD::SDIVREM8_SEXT_HREG:
29104 // TODO: Support more than just the sign extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is sign extended.
    return VTBits - 7;
  }

  // Fallback case.
  return 1;
}
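
// Worked example for ComputeNumSignBitsForTargetNode (illustrative, not from
// the original source): for X86ISD::VSRAI by 3 on a source with 5 known sign
// bits, the code above returns min(5 + 3, VTBits), since an arithmetic shift
// right only duplicates the sign bit.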
29115 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}
29121 /// Returns true (and the GlobalValue and the offset) if the node is a
29122 /// GlobalAddress + offset.
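/// Example (illustrative, not from the original source): for a node
/// Wrapper(GlobalAddress(@g, +8)) this returns true with GA = @g and
/// Offset = 8.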
29123 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
29124 const GlobalValue* &GA,
29125 int64_t &Offset) const {
29126 if (N->getOpcode() == X86ISD::Wrapper) {
29127 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
29128 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
29139 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29140 bool AllowFloatDomain, bool AllowIntDomain,
                                    SDValue &V1, const SDLoc &DL,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
29145 unsigned NumMaskElts = Mask.size();
29146 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
29148 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
29149 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
29150 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
29151 Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
29157 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
29158 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
29159 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
29160 unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
29165 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
29170 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
29171 MVT::getIntegerVT(MaskEltSize);
29172 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
29174 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
29175 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
          Shuffle = unsigned(X86ISD::VZEXT);
        } else
          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
29180 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        return true;
      }
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
29188 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
29189 isUndefOrEqual(Mask[0], 0) &&
29190 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
29191 Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Check if we have SSE3 which will let us use MOVDDUP etc. These
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
29199 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
29200 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
29201 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  if (MaskVT.is256BitVector() && AllowFloatDomain) {
29218 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
29219 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29220 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  if (MaskVT.is512BitVector() && AllowFloatDomain) {
29237 assert(Subtarget.hasAVX512() &&
29238 "AVX512 required for 512-bit vector shuffles");
29239 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29240 Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

29258 // Attempt to match against broadcast-from-vector.
29259 if (Subtarget.hasAVX2()) {
29260 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
29261 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
      SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
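
// Worked example for matchUnaryVectorShuffle (illustrative, not from the
// original source): a v4f32 mask {0, 0, 2, 2} duplicates the even lanes and is
// matched as MOVSLDUP, while {1, 1, 3, 3} duplicates the odd lanes and maps to
// MOVSHDUP.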
29271 // Attempt to match a combined shuffle mask against supported unary immediate
29272 // permute instructions.
29273 // TODO: Investigate sharing more of this with shuffle lowering.
29274 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29275 const APInt &Zeroable,
29276 bool AllowFloatDomain,
29277 bool AllowIntDomain,
29278 const X86Subtarget &Subtarget,
29279 unsigned &Shuffle, MVT &ShuffleVT,
29280 unsigned &PermuteImm) {
29281 unsigned NumMaskElts = Mask.size();
29282 unsigned InputSizeInBits = MaskVT.getSizeInBits();
29283 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
29284 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
29286 bool ContainsZeros =
29287 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
29290 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
29291 // Check for lane crossing permutes.
29292 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
29293 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
29294 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
29295 Shuffle = X86ISD::VPERMI;
29296 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
        PermuteImm = getV4X86ShuffleImm(Mask);
        return true;
      }
      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
29301 SmallVector<int, 4> RepeatedMask;
29302 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
29303 Shuffle = X86ISD::VPERMI;
29304 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
          return true;
        }
      }
    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
29310 // VPERMILPD can permute with a non-repeating shuffle.
29311 Shuffle = X86ISD::VPERMILPI;
      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
      PermuteImm = 0;
      for (int i = 0, e = Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
        PermuteImm |= (M & 1) << i;
      }
      return true;
    }
  }
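
  // Immediate example (illustrative, not from the original source): a v4f64
  // mask {1, 0, 3, 2} swaps the elements within each 128-bit lane, so the
  // per-element low bits give PermuteImm = 0b0101 for VPERMILPD.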
29325 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
29326 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
29327 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
29328 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
29329 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
29330 SmallVector<int, 4> RepeatedMask;
29331 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29332 // Narrow the repeated mask to create 32-bit element permutes.
29333 SmallVector<int, 4> WordMask = RepeatedMask;
29334 if (MaskScalarSizeInBits == 64)
29335 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
29337 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
29338 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
29339 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
      PermuteImm = getV4X86ShuffleImm(WordMask);
      return true;
    }
  }

  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
29346 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
29347 SmallVector<int, 4> RepeatedMask;
29348 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29349 ArrayRef<int> LoMask(Mask.data() + 0, 4);
29350 ArrayRef<int> HiMask(Mask.data() + 4, 4);
29352 // PSHUFLW: permute lower 4 elements only.
29353 if (isUndefOrInRange(LoMask, 0, 4) &&
29354 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
29355 Shuffle = X86ISD::PSHUFLW;
29356 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
29362 if (isUndefOrInRange(HiMask, 4, 8) &&
29363 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
29364 // Offset the HiMask so that we can create the shuffle immediate.
29365 int OffsetHiMask[4];
29366 for (int i = 0; i != 4; ++i)
29367 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
29369 Shuffle = X86ISD::PSHUFHW;
29370 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }
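
  // Worked example (illustrative, not from the original source): a v8i16 mask
  // {0, 1, 2, 3, 5, 4, 7, 6} leaves the low half alone and swaps adjacent
  // words in the high half; after the -4 offset, the mask {1, 0, 3, 2}
  // encodes the PSHUFHW immediate.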
29377 // Attempt to match against byte/bit shifts.
29378 // FIXME: Add 512-bit support.
29379 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29380 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29381 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
29382 MaskScalarSizeInBits, Mask,
29383 0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  return false;
}

// Attempt to match a combined unary shuffle mask against supported binary
29394 // shuffle instructions.
29395 // TODO: Investigate sharing more of this with shuffle lowering.
29396 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29397 bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                                     bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29405 if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
      return true;
    }
29419 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
29420 (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

29434 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
29435 // TODO add support for 256/512-bit types.
29436 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
    if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                                   Subtarget)) {
      DstVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
29445 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
29446 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29447 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
29448 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
29449 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      SrcVT = DstVT = MaskVT;
      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
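
// Worked example for matchBinaryVectorShuffle (illustrative, not from the
// original source): the v4f32 mask {0, 4, 1, 5} interleaves the low halves of
// the two inputs and is matched as an UNPCKL pattern.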
29462 static bool matchBinaryPermuteVectorShuffle(
29463 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
29464 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
29465 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
29466 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
29467 unsigned NumMaskElts = Mask.size();
29468 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29470 // Attempt to match against PALIGNR byte rotate.
29471 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29472 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29473 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
29474 if (0 < ByteRotation) {
29475 Shuffle = X86ISD::PALIGNR;
29476 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

29482 // Attempt to combine to X86ISD::BLENDI.
29483 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
29484 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
29485 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
29486 uint64_t BlendMask = 0;
29487 bool ForceV1Zero = false, ForceV2Zero = false;
29488 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
      if (MaskVT == MVT::v16i16) {
29492 // We can only use v16i16 PBLENDW if the lanes are repeated.
29493 SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
29500 if (RepeatedMask[i] >= 8)
29501 PermuteImm |= 1 << i;
29502 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29503 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29504 Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        // Determine a type compatible with X86ISD::BLENDI.
29510 ShuffleVT = MaskVT;
29511 if (Subtarget.hasAVX2()) {
29512 if (ShuffleVT == MVT::v4i64)
29513 ShuffleVT = MVT::v8i32;
29514 else if (ShuffleVT == MVT::v2i64)
            ShuffleVT = MVT::v4i32;
        } else {
          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
29518 ShuffleVT = MVT::v8i16;
29519 else if (ShuffleVT == MVT::v4i64)
29520 ShuffleVT = MVT::v4f64;
29521 else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }

        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }

29542 // Attempt to combine to INSERTPS.
29543 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
29544 MaskVT.is128BitVector()) {
29545 if (Zeroable.getBoolValue() &&
29546 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
29547 Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }

29553 // Attempt to combine to SHUFPD.
29554 if (AllowFloatDomain && EltSizeInBits == 64 &&
29555 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29556 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29557 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29558 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
29559 Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

29565 // Attempt to combine to SHUFPS.
29566 if (AllowFloatDomain && EltSizeInBits == 32 &&
29567 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
29568 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29569 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29570 SmallVector<int, 4> RepeatedMask;
29571 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
29572 // Match each half of the repeated mask, to determine if its just
29573 // referencing one of the vectors, is zeroable or entirely undef.
29574 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
29575 int M0 = RepeatedMask[Offset];
29576 int M1 = RepeatedMask[Offset + 1];
29578 if (isUndefInRange(RepeatedMask, Offset, 2)) {
29579 return DAG.getUNDEF(MaskVT);
29580 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
29581 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
29582 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
29583 return getZeroVector(MaskVT, Subtarget, DAG, DL);
29584 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
29585 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

29597 int ShufMask[4] = {-1, -1, -1, -1};
29598 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
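
// Worked example for matchBinaryPermuteVectorShuffle (illustrative, not from
// the original source): the v4f32 mask {0, 2, 4, 6} takes elements 0 and 2
// from V1 and elements 0 and 2 from V2, giving SHUFPS with the immediate
// (0 | 2 << 2 | 0 << 4 | 2 << 6).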
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
29618 /// This is the leaf of the recursive combine below. When we have found some
29619 /// chain of single-use x86 shuffle instructions and accumulated the combined
29620 /// shuffle mask represented by them, this will try to pattern match that mask
29621 /// into either a single instruction if there is a special purpose instruction
29622 /// for this operation, or into a PSHUFB instruction which is a fully general
29623 /// instruction but should only be used to replace chains over a certain depth.
29624 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
29625 ArrayRef<int> BaseMask, int Depth,
29626 bool HasVariableMask, SelectionDAG &DAG,
29627 const X86Subtarget &Subtarget) {
29628 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
29629 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
29630 "Unexpected number of shuffle inputs!");
29632 // Find the inputs that enter the chain. Note that multiple uses are OK
29633 // here, we're not going to remove the operands we find.
29634 bool UnaryShuffle = (Inputs.size() == 1);
29635 SDValue V1 = peekThroughBitcasts(Inputs[0]);
29636 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
29637 : peekThroughBitcasts(Inputs[1]));
29639 MVT VT1 = V1.getSimpleValueType();
29640 MVT VT2 = V2.getSimpleValueType();
29641 MVT RootVT = Root.getSimpleValueType();
29642 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
29643 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
29644 "Vector size mismatch");
29649 unsigned NumBaseMaskElts = BaseMask.size();
29650 if (NumBaseMaskElts == 1) {
29651 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
    return DAG.getBitcast(RootVT, V1);
  }

29655 unsigned RootSizeInBits = RootVT.getSizeInBits();
29656 unsigned NumRootElts = RootVT.getVectorNumElements();
29657 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
29658 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
29659 (RootVT.isFloatingPoint() && Depth >= 2) ||
29660 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
29662 // Don't combine if we are a AVX512/EVEX target and the mask element size
29663 // is different from the root element size - this would prevent writemasks
29664 // from being reused.
29665 // TODO - this currently prevents all lane shuffles from occurring.
29666 // TODO - check for writemasks usage instead of always preventing combining.
29667 // TODO - attempt to narrow Mask back to writemask size.
29668 bool IsEVEXShuffle =
29669 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
29671 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
29673 // Handle 128-bit lane shuffles of 256-bit vectors.
29674 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
29675 // we need to use the zeroing feature.
29676 // TODO - this should support binary shuffles.
29677 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
29678 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
29679 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
29680 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
29681 return SDValue(); // Nothing to do!
29682 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
29683 unsigned PermMask = 0;
29684 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
29685 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
29687 Res = DAG.getBitcast(ShuffleVT, V1);
29688 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
29689 DAG.getUNDEF(ShuffleVT),
29690 DAG.getConstant(PermMask, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

29694 // For masks that have been widened to 128-bit elements or more,
29695 // narrow back down to 64-bit elements.
29696 SmallVector<int, 64> Mask;
29697 if (BaseMaskEltSizeInBits > 64) {
29698 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
29699 int MaskScale = BaseMaskEltSizeInBits / 64;
    scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
  } else {
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
  }

29705 unsigned NumMaskElts = Mask.size();
29706 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
29708 // Determine the effective mask value type.
29709 FloatDomain &= (32 <= MaskEltSizeInBits);
29710 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
29711 : MVT::getIntegerVT(MaskEltSizeInBits);
29712 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
29714 // Only allow legal mask types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return SDValue();

29718 // Attempt to match the mask against known shuffle patterns.
29719 MVT ShuffleSrcVT, ShuffleVT;
29720 unsigned Shuffle, PermuteImm;
29722 // Which shuffle domains are permitted?
29723 // Permit domain crossing at higher combine depths.
29724 bool AllowFloatDomain = FloatDomain || (Depth > 3);
29725 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
29726 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
29728 // Determine zeroable mask elements.
29729 APInt Zeroable(NumMaskElts, 0);
29730 for (unsigned i = 0; i != NumMaskElts; ++i)
29731 if (isUndefOrZero(Mask[i]))
29732 Zeroable.setBit(i);
29734 if (UnaryShuffle) {
29735 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
29736 // directly if we don't shuffle the lower element and we shuffle the upper
29737 // (zero) elements within themselves.
29738 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
29739 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
29740 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
29741 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
29742 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
29743 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
        return DAG.getBitcast(RootVT, V1);
      }
    }

    SDValue NewV1 = V1; // Save operand in case early exit happens.
29749 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29750 NewV1, DL, DAG, Subtarget, Shuffle,
29751 ShuffleSrcVT, ShuffleVT) &&
29752 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29753 if (Depth == 1 && Root.getOpcode() == Shuffle)
29754 return SDValue(); // Nothing to do!
29755 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
29756 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
      return DAG.getBitcast(RootVT, Res);
    }

29760 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
29761 AllowIntDomain, Subtarget, Shuffle,
29762 ShuffleVT, PermuteImm) &&
29763 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29764 if (Depth == 1 && Root.getOpcode() == Shuffle)
29765 return SDValue(); // Nothing to do!
29766 Res = DAG.getBitcast(ShuffleVT, V1);
29767 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
29768 DAG.getConstant(PermuteImm, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }
  }

  SDValue NewV1 = V1; // Save operands in case early exit happens.
  SDValue NewV2 = V2;
29775 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29776 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
29777 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
29778 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29779 if (Depth == 1 && Root.getOpcode() == Shuffle)
29780 return SDValue(); // Nothing to do!
29781 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
29782 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
29783 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
    return DAG.getBitcast(RootVT, Res);
  }

  NewV1 = V1; // Save operands in case early exit happens.
  NewV2 = V2;
  if (matchBinaryPermuteVectorShuffle(
29790 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
29791 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
29792 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29793 if (Depth == 1 && Root.getOpcode() == Shuffle)
29794 return SDValue(); // Nothing to do!
29795 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
29796 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
29797 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
29798 DAG.getConstant(PermuteImm, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

29802 // Typically from here on, we need an integer version of MaskVT.
29803 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
29804 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
29806 // Annoyingly, SSE4A instructions don't map into the above match helpers.
29807 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
29808 uint64_t BitLen, BitIdx;
    if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
                                  Zeroable)) {
      if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
29812 return SDValue(); // Nothing to do!
29813 V1 = DAG.getBitcast(IntMaskVT, V1);
29814 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
29815 DAG.getConstant(BitLen, DL, MVT::i8),
29816 DAG.getConstant(BitIdx, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }

    if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
29821 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
29822 return SDValue(); // Nothing to do!
29823 V1 = DAG.getBitcast(IntMaskVT, V1);
29824 V2 = DAG.getBitcast(IntMaskVT, V2);
29825 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
29826 DAG.getConstant(BitLen, DL, MVT::i8),
29827 DAG.getConstant(BitIdx, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }
  }

  // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return SDValue();

  // Depth threshold above which we can efficiently use variable mask shuffles.
29838 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
29839 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
29841 bool MaskContainsZeros =
29842 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29844 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
29845 // If we have a single input lane-crossing shuffle then lower to VPERMV.
29846 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29847 ((Subtarget.hasAVX2() &&
29848 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29849 (Subtarget.hasAVX512() &&
29850 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29851 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29852 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29853 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29854 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29855 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29856 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29857 Res = DAG.getBitcast(MaskVT, V1);
29858 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
      return DAG.getBitcast(RootVT, Res);
    }

    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
29863 // vector as the second source.
29864 if (UnaryShuffle && AllowVariableMask &&
29865 ((Subtarget.hasAVX512() &&
29866 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29867 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29868 (Subtarget.hasVLX() &&
29869 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29870 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29871 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29872 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29873 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29874 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29875 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
29876 for (unsigned i = 0; i != NumMaskElts; ++i)
29877 if (Mask[i] == SM_SentinelZero)
29878 Mask[i] = NumMaskElts + i;
29880 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29881 Res = DAG.getBitcast(MaskVT, V1);
29882 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
29883 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
      return DAG.getBitcast(RootVT, Res);
    }

    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
29888 if (AllowVariableMask && !MaskContainsZeros &&
29889 ((Subtarget.hasAVX512() &&
29890 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29891 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29892 (Subtarget.hasVLX() &&
29893 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29894 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29895 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29896 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29897 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29898 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29899 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29900 V1 = DAG.getBitcast(MaskVT, V1);
29901 V2 = DAG.getBitcast(MaskVT, V2);
29902 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
      return DAG.getBitcast(RootVT, Res);
    }
    return SDValue();
  }

  // See if we can combine a single input shuffle with zeros to a bit-mask,
29909 // which is much simpler than any shuffle.
29910 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29911 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29912 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29913 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29914 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29915 APInt UndefElts(NumMaskElts, 0);
29916 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts.setBit(i);
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
29927 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29928 Res = DAG.getBitcast(MaskVT, V1);
29929 unsigned AndOpcode =
29930 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29931 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
    return DAG.getBitcast(RootVT, Res);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use the variable mask to VPERMILPS.
  // TODO: Combine other mask types at higher depths.
29938 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29939 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29940 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29941 SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx =
          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
29947 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29948 Res = DAG.getBitcast(MaskVT, V1);
29949 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
    return DAG.getBitcast(RootVT, Res);
  }

29953 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29954 // to VPERMIL2PD/VPERMIL2PS.
29955 if (AllowVariableMask && Subtarget.hasXOP() &&
29956 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29957 MaskVT == MVT::v8f32)) {
29958 // VPERMIL2 Operation.
29959 // Bits[3] - Match Bit.
29960 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29961 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
29962 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29963 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29964 SmallVector<int, 8> VPerm2Idx;
29965 unsigned M2ZImm = 0;
29966 for (int M : Mask) {
      if (M == SM_SentinelUndef) {
        VPerm2Idx.push_back(-1);
        continue;
      }
      if (M == SM_SentinelZero) {
        M2ZImm = 2;
        VPerm2Idx.push_back(8);
        continue;
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29977 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
      VPerm2Idx.push_back(Index);
    }
    V1 = DAG.getBitcast(MaskVT, V1);
29981 V2 = DAG.getBitcast(MaskVT, V2);
29982 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29983 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29984 DAG.getConstant(M2ZImm, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

29988 // If we have 3 or more shuffle instructions or a chain involving a variable
29989 // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
29993 if (UnaryShuffle && AllowVariableMask &&
29994 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29995 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29996 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29997 SmallVector<SDValue, 16> PSHUFBMask;
29998 int NumBytes = RootVT.getSizeInBits() / 8;
29999 int Ratio = NumBytes / NumMaskElts;
30000 for (int i = 0; i < NumBytes; ++i) {
30001 int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
30011 assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
30015 Res = DAG.getBitcast(ByteVT, V1);
30016 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
30017 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
    return DAG.getBitcast(RootVT, Res);
  }

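  // PSHUFB control-byte encoding (illustrative note, not from the original
  // source): each control byte selects a byte within the same 16-byte lane,
  // and a byte with bit 7 set (such as the 255 used above) zeroes the
  // destination byte instead.
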
30021 // With XOP, if we have a 128-bit binary input shuffle we can always combine
30022 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
30023 // slower than PSHUFB on targets that support both.
30024 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
30025 // VPPERM Mask Operation
30026 // Bits[4:0] - Byte Index (0 - 31)
30027 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = RootVT.getSizeInBits() / 8;
    int Ratio = NumBytes / NumMaskElts;
30031 for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
30044 MVT ByteVT = MVT::v16i8;
30045 V1 = DAG.getBitcast(ByteVT, V1);
30046 V2 = DAG.getBitcast(ByteVT, V2);
30047 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
30048 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
    return DAG.getBitcast(RootVT, Res);
  }

  // Failed to find any combines.
  return SDValue();
}
30056 // Attempt to constant fold all of the constant source ops.
30057 // Returns true if the entire shuffle is folded to a constant.
30058 // TODO: Extend this to merge multiple constant Ops and update the mask.
30059 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
30060 ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
30064 MVT VT = Root.getSimpleValueType();
30066 unsigned SizeInBits = VT.getSizeInBits();
30067 unsigned NumMaskElts = Mask.size();
30068 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
30069 unsigned NumOps = Ops.size();
30071 // Extract constant bits from each source op.
30072 bool OneUseConstantOp = false;
30073 SmallVector<APInt, 16> UndefEltsOps(NumOps);
30074 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
30075 for (unsigned i = 0; i != NumOps; ++i) {
30076 SDValue SrcOp = Ops[i];
30077 OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return SDValue();
  }

30083 // Only fold if at least one of the constants is only used once or
30084 // the combined shuffle has included a variable mask shuffle, this
30085 // is to avoid constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return SDValue();

30089 // Shuffle the constant bits according to the mask.
30090 APInt UndefElts(NumMaskElts, 0);
30091 APInt ZeroElts(NumMaskElts, 0);
30092 APInt ConstantElts(NumMaskElts, 0);
30093 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
30094 APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));
30106 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
30107 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
30109 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
30110 if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  return DAG.getBitcast(VT, CstOp);
}
30141 /// Fully generic combining of x86 shuffle instructions.
30143 /// This should be the last combine run over the x86 shuffle instructions. Once
30144 /// they have been fully optimized, this will recursively consider all chains
30145 /// of single-use shuffle instructions, build a generic model of the cumulative
30146 /// shuffle operation, and check for simpler instructions which implement this
30147 /// operation. We use this primarily for two purposes:
30149 /// 1) Collapse generic shuffles to specialized single instructions when
30150 /// equivalent. In most cases, this is just an encoding size win, but
30151 /// sometimes we will collapse multiple generic shuffles into a single
30152 /// special-purpose shuffle.
30153 /// 2) Look for sequences of shuffle instructions with 3 or more total
30154 /// instructions, and replace them with the slightly more expensive SSSE3
30155 /// PSHUFB instruction if available. We do this as the last combining step
30156 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
30157 /// a suitable short sequence of other instructions. The PSHUFB will either
30158 /// use a register or have to read from memory and so is slightly (but only
30159 /// slightly) more expensive than the other shuffle instructions.
30161 /// Because this is inherently a quadratic operation (for each shuffle in
30162 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
30163 /// This should never be an issue in practice as the shuffle lowering doesn't
30164 /// produce sequences of more than 8 instructions.
30166 /// FIXME: We will currently miss some cases where the redundant shuffling
30167 /// would simplify under the threshold for PSHUFB formation because of
30168 /// combine-ordering. To fix this, we should do the redundant instruction
30169 /// combining in this recursive walk.
30170 static SDValue combineX86ShufflesRecursively(
30171 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
30172 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
30173 bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
30174 // Bound the depth of our recursive combine because this is ultimately
30175 // quadratic in nature.
30176 const unsigned MaxRecursionDepth = 8;
  if (Depth > MaxRecursionDepth)
    return SDValue();

30180 // Directly rip through bitcasts to find the underlying operand.
30181 SDValue Op = SrcOps[SrcOpIndex];
30182 Op = peekThroughOneUseBitcasts(Op);
30184 MVT VT = Op.getSimpleValueType();
30185 if (!VT.isVector())
30186 return SDValue(); // Bail if we hit a non-vector.
30188 assert(Root.getSimpleValueType().isVector() &&
30189 "Shuffles operate on vector types!");
30190 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
30191 "Can only combine shuffles of the same vector register size.");
30193 // Extract target shuffle mask and resolve sentinels and inputs.
30194 SmallVector<int, 64> OpMask;
30195 SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
    return SDValue();

30199 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
30200 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
30201 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
30203 // Add the inputs to the Ops list, avoiding duplicates.
30204 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
30206 int InputIdx0 = -1, InputIdx1 = -1;
30207 for (int i = 0, e = Ops.size(); i < e; ++i) {
30208 SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }
  if (Input0 && InputIdx0 < 0) {
30216 InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
30220 InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
30225 RootMask.size() % OpMask.size() == 0) ||
30226 (OpMask.size() > RootMask.size() &&
30227 OpMask.size() % RootMask.size() == 0) ||
30228 OpMask.size() == RootMask.size()) &&
30229 "The smaller number of elements must divide the larger.");
30231 // This function can be performance-critical, so we rely on the power-of-2
30232 // knowledge that we have about the mask sizes to replace div/rem ops with
30233 // bit-masks and shifts.
30234 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
30235 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
30236 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
30237 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
30239 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
30240 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
30241 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
30242 assert((RootRatio == 1 || OpRatio == 1) &&
30243 "Must not have a ratio for both incoming and op masks!");
30245 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
30246 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
30247 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
30248 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
30249 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
30251 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
30253 // Merge this shuffle operation's mask into our accumulated mask. Note that
30254 // this shuffle's mask will be the first applied to the input, followed by the
30255 // root mask to get us all the way to the root value arrangement. The reason
30256 // for this order is that we are recursing up the operation chain.
30257 for (unsigned i = 0; i < MaskWidth; ++i) {
30258 unsigned RootIdx = i >> RootRatioLog2;
30259 if (RootMask[RootIdx] < 0) {
30260 // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
30270 // Just insert the scaled root mask value if it references an input other
30271 // than the SrcOp we're currently inserting.
30272 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
30273 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
30279 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
30280 if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

30287 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
30293 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
30294 if (OpMask[OpIdx] < (int)OpMask.size()) {
30295 assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }

30305 // Handle the all undef/zero cases early.
30306 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
30307 return DAG.getUNDEF(Root.getValueType());
30309 // TODO - should we handle the mixed zero/undef case as well? Just returning
30310 // a zero mask will lose information on undef elements possibly reducing
30311 // future combine possibilities.
30312 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
                         SDLoc(Root));

30316 // Remove unused shuffle source ops.
30317 resolveTargetShuffleInputsAndMask(Ops, Mask);
30318 assert(!Ops.empty() && "Shuffle with no inputs detected");
30320 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
30322 // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());
30327 // See if we can recurse into each shuffle source op (if it's a target
30328 // shuffle). The source op should only be combined if it either has a
30329 // single use (i.e. current Op) or all its users have already been combined.
30330 // Don't recurse if we already have more source ops than we can combine in
30331 // the remaining recursion depth.
30332 if (Ops.size() < (MaxRecursionDepth - Depth)) {
30333 for (int i = 0, e = Ops.size(); i < e; ++i)
30334 if (Ops[i].getNode()->hasOneUse() ||
30335 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
        if (SDValue Res = combineX86ShufflesRecursively(
                Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
                DAG, Subtarget))
          return Res;
  }

30342 // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
    return Cst;

30347 // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return SDValue();

30351 // Minor canonicalization of the accumulated shuffle mask to make it easier
30352 // to match below. All this does is detect masks with sequential pairs of
30353 // elements, and shrink them to the half-width mask. It does this in a loop
30354 // so it will reduce the size of the mask to the minimal width mask which
30355 // performs an equivalent shuffle.
30356 SmallVector<int, 64> WidenedMask;
30357 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
  }

30361 // Canonicalization of binary shuffle masks to improve pattern matching by
30362 // commuting the inputs.
30363 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
30364 ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

30368 // Finally, try to combine into a single shuffle instruction.
  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                Subtarget);
}
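
// Mask-merge example for combineX86ShufflesRecursively (illustrative, not
// from the original source): with RootMask {1, 0} over an op whose OpMask is
// {2, 3, 0, 1}, MaskWidth is 4 and RootRatio is 2, so root element 1 expands
// to op elements {2, 3}; the merged mask comes out as {0, 1, 2, 3}, i.e. the
// two shuffles cancel.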
30373 /// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
30377 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
30378 MVT VT = N.getSimpleValueType();
30379 SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

30387 // If we have more than 128-bits, only the low 128-bits of shuffle mask
30388 // matter. Check that the upper masks are repeats and remove them.
30389 if (VT.getSizeInBits() > 128) {
30390 int LaneElts = 128 / VT.getScalarSizeInBits();
30392 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
30393 for (int j = 0; j < LaneElts; ++j)
30394 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
30395 "Mask doesn't repeat in high 128-bit lanes!");
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  }

  llvm_unreachable("No valid shuffle instruction found!");
}
30416 /// Search for a combinable shuffle across a chain ending in pshufd.
30418 /// We walk up the chain and look for a combinable shuffle, skipping over
30419 /// shuffles that we could hoist this shuffle's transformation past without
30420 /// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
30424 assert(N.getOpcode() == X86ISD::PSHUFD &&
30425 "Called with something other than an x86 128-bit half shuffle!");
30428 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
30429 // of the shuffles in the chain so that we can form a fresh chain to replace
30431 SmallVector<SDValue, 8> Chain;
30432 SDValue V = N.getOperand(0);
30433 for (; V.hasOneUse(); V = V.getOperand(0)) {
30434 switch (V.getOpcode()) {
30436 return SDValue(); // Nothing combined!
30438 case ISD::BITCAST:
30439 // Skip bitcasts as we always know the type for the target specific
30440 // instructions.
30441 continue;
30443 case X86ISD::PSHUFD:
30444 // Found another dword shuffle.
30445 break;
30447 case X86ISD::PSHUFLW:
30448 // Check that the low words (being shuffled) are the identity in the
30449 // dword shuffle, and the high words are self-contained.
30450 if (Mask[0] != 0 || Mask[1] != 1 ||
30451 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
30452 return SDValue();
30454 Chain.push_back(V);
30455 continue;
30457 case X86ISD::PSHUFHW:
30458 // Check that the high words (being shuffled) are the identity in the
30459 // dword shuffle, and the low words are self-contained.
30460 if (Mask[2] != 2 || Mask[3] != 3 ||
30461 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
30462 return SDValue();
30464 Chain.push_back(V);
30465 continue;
30467 case X86ISD::UNPCKL:
30468 case X86ISD::UNPCKH:
30469 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
30470 // shuffle into a preceding word shuffle.
30471 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
30472 V.getSimpleValueType().getVectorElementType() != MVT::i16)
30473 return SDValue();
30475 // Search for a half-shuffle which we can combine with.
30476 unsigned CombineOp =
30477 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
30478 if (V.getOperand(0) != V.getOperand(1) ||
30479 !V->isOnlyUserOf(V.getOperand(0).getNode()))
30480 return SDValue();
30481 Chain.push_back(V);
30482 V = V.getOperand(0);
30483 do {
30484 switch (V.getOpcode()) {
30485 default:
30486 return SDValue(); // Nothing to combine.
30488 case X86ISD::PSHUFLW:
30489 case X86ISD::PSHUFHW:
30490 if (V.getOpcode() == CombineOp)
30491 break;
30493 Chain.push_back(V);
30495 LLVM_FALLTHROUGH;
30496 case ISD::BITCAST:
30497 V = V.getOperand(0);
30498 continue;
30499 }
30500 break;
30501 } while (V.hasOneUse());
30502 break;
30503 }
30504 // Break out of the loop if we break out of the switch.
30505 break;
30506 }
30508 if (!V.hasOneUse())
30509 // We fell out of the loop without finding a viable combining instruction.
30510 return SDValue();
30512 // Merge this node's mask and our incoming mask.
30513 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30514 for (int &M : Mask)
30515 M = VMask[M];
30516 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
30517 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30519 // Rebuild the chain around this new shuffle.
30520 while (!Chain.empty()) {
30521 SDValue W = Chain.pop_back_val();
30523 if (V.getValueType() != W.getOperand(0).getValueType())
30524 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
30526 switch (W.getOpcode()) {
30527 default:
30528 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
30530 case X86ISD::UNPCKL:
30531 case X86ISD::UNPCKH:
30532 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
30533 break;
30535 case X86ISD::PSHUFD:
30536 case X86ISD::PSHUFLW:
30537 case X86ISD::PSHUFHW:
30538 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
30539 break;
30540 }
30541 }
30542 if (V.getValueType() != N.getValueType())
30543 V = DAG.getBitcast(N.getValueType(), V);
30545 // Return the new chain to replace N.
30546 return V;
30547 }
30549 /// Search for a combinable shuffle across a chain ending in pshuflw or
30550 /// pshufhw.
30552 /// We walk up the chain, skipping shuffles of the other half and looking
30553 /// through shuffles which switch halves trying to find a shuffle of the same
30554 /// pair of dwords.
30555 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
30556 SelectionDAG &DAG,
30557 TargetLowering::DAGCombinerInfo &DCI) {
30558 assert(
30559 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
30560 "Called with something other than an x86 128-bit half shuffle!");
30561 SDLoc DL(N);
30562 unsigned CombineOpcode = N.getOpcode();
30564 // Walk up a single-use chain looking for a combinable shuffle.
30565 SDValue V = N.getOperand(0);
30566 for (; V.hasOneUse(); V = V.getOperand(0)) {
30567 switch (V.getOpcode()) {
30568 default:
30569 return false; // Nothing combined!
30571 case ISD::BITCAST:
30572 // Skip bitcasts as we always know the type for the target specific
30573 // instructions.
30574 continue;
30576 case X86ISD::PSHUFLW:
30577 case X86ISD::PSHUFHW:
30578 if (V.getOpcode() == CombineOpcode)
30579 break;
30581 // Other-half shuffles are no-ops.
30582 continue;
30583 }
30584 // Break out of the loop if we break out of the switch.
30585 break;
30586 }
30588 if (!V.hasOneUse())
30589 // We fell out of the loop without finding a viable combining instruction.
30590 return false;
30592 // Combine away the bottom node as its shuffle will be accumulated into
30593 // a preceding shuffle.
30594 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30596 // Record the old value.
30597 SDValue Old = V;
30599 // Merge this node's mask and our incoming mask (adjusted to account for all
30600 // the pshufd instructions encountered).
30601 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30602 for (int &M : Mask)
30603 M = VMask[M];
30604 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
30605 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30607 // Check that the shuffles didn't cancel each other out. If not, we need to
30608 // combine to the new one.
30609 if (Old != V)
30610 // Replace the combinable shuffle with the combined one, updating all users
30611 // so that we re-evaluate the chain here.
30612 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
30614 return true;
30615 }
30617 /// Try to combine x86 target specific shuffles.
30618 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
30619 TargetLowering::DAGCombinerInfo &DCI,
30620 const X86Subtarget &Subtarget) {
30621 SDLoc DL(N);
30622 MVT VT = N.getSimpleValueType();
30623 SmallVector<int, 4> Mask;
30624 unsigned Opcode = N.getOpcode();
30626 // Combine a binary shuffle of two similar 'Horizontal' instructions into a
30627 // single instruction.
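// For example, for v2f64: UNPCKL(HADD(A,B), HADD(C,D)) --> HADD(A,C) and
// UNPCKH(HADD(A,B), HADD(C,D)) --> HADD(B,D).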
30628 if (VT.getScalarSizeInBits() == 64 &&
30629 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
30630 Opcode == X86ISD::UNPCKL)) {
30631 auto BC0 = peekThroughBitcasts(N.getOperand(0));
30632 auto BC1 = peekThroughBitcasts(N.getOperand(1));
30633 EVT VT0 = BC0.getValueType();
30634 EVT VT1 = BC1.getValueType();
30635 unsigned Opcode0 = BC0.getOpcode();
30636 unsigned Opcode1 = BC1.getOpcode();
30637 if (Opcode0 == Opcode1 && VT0 == VT1 &&
30638 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
30639 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
30640 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
30641 SDValue Lo, Hi;
30642 if (Opcode == X86ISD::MOVSD) {
30643 Lo = BC1.getOperand(0);
30644 Hi = BC0.getOperand(1);
30645 } else {
30646 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30647 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30648 }
30649 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
30650 return DAG.getBitcast(VT, Horiz);
30651 }
30652 }
30654 switch (Opcode) {
30655 case X86ISD::VBROADCAST: {
30656 // If broadcasting from another shuffle, attempt to simplify it.
30657 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
30658 SDValue Src = N.getOperand(0);
30659 SDValue BC = peekThroughBitcasts(Src);
30660 EVT SrcVT = Src.getValueType();
30661 EVT BCVT = BC.getValueType();
30662 if (isTargetShuffle(BC.getOpcode()) &&
30663 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
30664 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
30665 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
30666 SM_SentinelUndef);
30667 for (unsigned i = 0; i != Scale; ++i)
30668 DemandedMask[i] = i;
30669 if (SDValue Res = combineX86ShufflesRecursively(
30670 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
30671 /*HasVarMask*/ false, DAG, Subtarget))
30672 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
30673 DAG.getBitcast(SrcVT, Res));
30674 }
30675 return SDValue();
30676 }
30677 case X86ISD::PSHUFD:
30678 case X86ISD::PSHUFLW:
30679 case X86ISD::PSHUFHW:
30680 Mask = getPSHUFShuffleMask(N);
30681 assert(Mask.size() == 4);
30682 break;
30683 case X86ISD::UNPCKL: {
30684 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
30685 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
30686 // moves upper half elements into the lower half part. For example:
30688 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
30689 // undef:v16i8
30690 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
30692 // will be combined to:
30694 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
30696 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
30697 // happen because more advanced instructions are selected instead.
30698 if (!VT.is128BitVector())
30699 return SDValue();
30701 auto Op0 = N.getOperand(0);
30702 auto Op1 = N.getOperand(1);
30703 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
30704 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
30706 unsigned NumElts = VT.getVectorNumElements();
30707 SmallVector<int, 8> ExpectedMask(NumElts, -1);
30708 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
30709 NumElts / 2);
30711 auto ShufOp = Op1.getOperand(0);
30712 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
30713 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
30714 }
30715 return SDValue();
30716 }
30717 case X86ISD::MOVSD:
30718 case X86ISD::MOVSS: {
30719 SDValue N0 = N.getOperand(0);
30720 SDValue N1 = N.getOperand(1);
30722 // Canonicalize scalar FPOps:
30723 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
30724 // If commutable, allow OP(N1[0], N0[0]).
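// For example, (MOVSD A, (fadd A, B)) becomes
// (MOVSD A, (scalar_to_vector (fadd A[0], B[0]))).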
30725 unsigned Opcode1 = N1.getOpcode();
30726 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
30727 Opcode1 == ISD::FDIV) {
30728 SDValue N10 = N1.getOperand(0);
30729 SDValue N11 = N1.getOperand(1);
30730 if (N10 == N0 ||
30731 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
30732 if (N10 != N0)
30733 std::swap(N10, N11);
30734 MVT SVT = VT.getVectorElementType();
30735 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
30736 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
30737 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
30738 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
30739 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
30740 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
30741 }
30742 }
30743 return SDValue();
30744 }
30746 case X86ISD::INSERTPS: {
30747 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
30748 SDValue Op0 = N.getOperand(0);
30749 SDValue Op1 = N.getOperand(1);
30750 SDValue Op2 = N.getOperand(2);
30751 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
30752 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
30753 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
30754 unsigned ZeroMask = InsertPSMask & 0xF;
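// INSERTPS imm8 layout: bits [7:6] select the source element (CountS),
// bits [5:4] select the destination element (CountD), and bits [3:0]
// zero destination elements (ZMask).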
30756 // If we zero out all elements from Op0 then we don't need to reference it.
30757 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
30758 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
30759 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30761 // If we zero out the element from Op1 then we don't need to reference it.
30762 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
30763 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30764 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30766 // Attempt to merge insertps Op1 with an inner target shuffle node.
30767 SmallVector<int, 8> TargetMask1;
30768 SmallVector<SDValue, 2> Ops1;
30769 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
30770 int M = TargetMask1[SrcIdx];
30771 if (isUndefOrZero(M)) {
30772 // Zero/UNDEF insertion - zero out element and remove dependency.
30773 InsertPSMask |= (1u << DstIdx);
30774 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30775 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30776 }
30777 // Update insertps mask srcidx and reference the source input directly.
30778 assert(0 <= M && M < 8 && "Shuffle index out of range");
30779 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
30780 Op1 = Ops1[M < 4 ? 0 : 1];
30781 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30782 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30783 }
30785 // Attempt to merge insertps Op0 with an inner target shuffle node.
30786 SmallVector<int, 8> TargetMask0;
30787 SmallVector<SDValue, 2> Ops0;
30788 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
30789 return SDValue();
30791 bool Updated = false;
30792 bool UseInput00 = false;
30793 bool UseInput01 = false;
30794 for (int i = 0; i != 4; ++i) {
30795 int M = TargetMask0[i];
30796 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
30797 // No change if element is already zero or the inserted element.
30798 continue;
30799 } else if (isUndefOrZero(M)) {
30800 // If the target mask is undef/zero then we must zero the element.
30801 InsertPSMask |= (1u << i);
30802 Updated = true;
30803 continue;
30804 }
30806 // The input vector element must be inline.
30807 if (M != i && M != (i + 4))
30808 return SDValue();
30810 // Determine which inputs of the target shuffle we're using.
30811 UseInput00 |= (0 <= M && M < 4);
30812 UseInput01 |= (4 <= M);
30813 }
30815 // If we're not using both inputs of the target shuffle then use the
30816 // referenced input directly.
30817 if (UseInput00 && !UseInput01) {
30818 Updated = true;
30819 Op0 = Ops0[0];
30820 } else if (!UseInput00 && UseInput01) {
30821 Updated = true;
30822 Op0 = Ops0[1];
30823 }
30825 if (Updated)
30826 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30827 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30829 return SDValue();
30830 }
30831 default:
30832 return SDValue();
30833 }
30835 // Nuke no-op shuffles that show up after combining.
30836 if (isNoopShuffleMask(Mask))
30837 return N.getOperand(0);
30839 // Look for simplifications involving one or two shuffle instructions.
30840 SDValue V = N.getOperand(0);
30841 switch (N.getOpcode()) {
30842 default:
30843 break;
30844 case X86ISD::PSHUFLW:
30845 case X86ISD::PSHUFHW:
30846 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
30848 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
30849 return SDValue(); // We combined away this shuffle, so we're done.
30851 // See if this reduces to a PSHUFD which is no more expensive and can
30852 // combine with more operations. Note that it has to at least flip the
30853 // dwords as otherwise it would have been removed as a no-op.
30854 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
30855 int DMask[] = {0, 1, 2, 3};
30856 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
30857 DMask[DOffset + 0] = DOffset + 1;
30858 DMask[DOffset + 1] = DOffset + 0;
30859 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30860 V = DAG.getBitcast(DVT, V);
30861 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
30862 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
30863 return DAG.getBitcast(VT, V);
30864 }
30866 // Look for shuffle patterns which can be implemented as a single unpack.
30867 // FIXME: This doesn't handle the location of the PSHUFD generically, and
30868 // only works when we have a PSHUFD followed by two half-shuffles.
30869 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30870 (V.getOpcode() == X86ISD::PSHUFLW ||
30871 V.getOpcode() == X86ISD::PSHUFHW) &&
30872 V.getOpcode() != N.getOpcode() &&
30873 V.hasOneUse()) {
30874 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30875 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
30876 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30877 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30878 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30879 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30880 int WordMask[8];
30881 for (int i = 0; i < 4; ++i) {
30882 WordMask[i + NOffset] = Mask[i] + NOffset;
30883 WordMask[i + VOffset] = VMask[i] + VOffset;
30884 }
30885 // Map the word mask through the DWord mask.
30886 int MappedMask[8];
30887 for (int i = 0; i < 8; ++i)
30888 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30889 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30890 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30891 // We can replace all three shuffles with an unpack.
30892 V = DAG.getBitcast(VT, D.getOperand(0));
30893 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30894 : X86ISD::UNPCKH,
30895 DL, VT, V, V);
30896 }
30897 }
30898 }
30900 break;
30902 case X86ISD::PSHUFD:
30903 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30904 return NewN;
30906 break;
30907 }
30909 return SDValue();
30910 }
30912 /// Checks if the shuffle mask takes subsequent elements
30913 /// alternately from two vectors.
30914 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
30915 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
30917 int ParitySrc[2] = {-1, -1};
30918 unsigned Size = Mask.size();
30919 for (unsigned i = 0; i != Size; ++i) {
30920 int M = Mask[i];
30921 if (M < 0)
30922 continue;
30924 // Make sure we are using the matching element from the input.
30925 if ((M % Size) != i)
30926 return false;
30928 // Make sure we use the same input for all elements of the same parity.
30929 int Src = M / Size;
30930 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
30931 return false;
30932 ParitySrc[i % 2] = Src;
30933 }
30935 // Make sure each input is used.
30936 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
30937 return false;
30939 Op0Even = ParitySrc[0] == 0;
30940 return true;
30941 }
30943 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
30944 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
30945 /// are written to the parameters \p Opnd0 and \p Opnd1.
30947 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
30948 /// shuffle nodes so they are easier to match generically. We also insert dummy
30949 /// vector shuffle nodes for the operands which explicitly discard the lanes
30950 /// which are unused by this operation, to try to convey to the rest of the
30951 /// combiner that those lanes are unused.
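///
/// For example, the v4f32 shuffle <0,5,2,7> of (fsub A, B) and (fadd A, B)
/// selects subtraction in the even lanes and addition in the odd lanes,
/// which is exactly ADDSUB(A, B).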
30952 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30953 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
30954 bool &IsSubAdd) {
30956 EVT VT = N->getValueType(0);
30957 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30958 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
30959 !VT.getSimpleVT().isFloatingPoint())
30960 return false;
30962 // We only handle target-independent shuffles.
30963 // FIXME: It would be easy and harmless to use the target shuffle mask
30964 // extraction tool to support more.
30965 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30966 return false;
30968 SDValue V1 = N->getOperand(0);
30969 SDValue V2 = N->getOperand(1);
30971 // Make sure we have an FADD and an FSUB.
30972 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
30973 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
30974 V1.getOpcode() == V2.getOpcode())
30975 return false;
30977 // If there are other uses of these operations we can't fold them.
30978 if (!V1->hasOneUse() || !V2->hasOneUse())
30979 return false;
30981 // Ensure that both operations have the same operands. Note that we can
30982 // commute the FADD operands.
30983 SDValue LHS, RHS;
30984 if (V1.getOpcode() == ISD::FSUB) {
30985 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
30986 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30987 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30988 return false;
30989 } else {
30990 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
30991 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
30992 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
30993 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
30994 return false;
30995 }
30997 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30998 bool Op0Even;
30999 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31000 return false;
31002 // It's a subadd if the vector in the even parity is an FADD.
31003 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
31004 : V2->getOpcode() == ISD::FADD;
31006 Opnd0 = LHS;
31007 Opnd1 = RHS;
31008 return true;
31009 }
31011 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
31012 static SDValue combineShuffleToFMAddSub(SDNode *N,
31013 const X86Subtarget &Subtarget,
31014 SelectionDAG &DAG) {
31015 // We only handle target-independent shuffles.
31016 // FIXME: It would be easy and harmless to use the target shuffle mask
31017 // extraction tool to support more.
31018 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
31019 return SDValue();
31021 MVT VT = N->getSimpleValueType(0);
31022 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31023 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
31024 return SDValue();
31026 // We're trying to match (shuffle fma(a,b,c), X86Fmsub(a,b,c)).
31027 SDValue Op0 = N->getOperand(0);
31028 SDValue Op1 = N->getOperand(1);
31029 SDValue FMAdd = Op0, FMSub = Op1;
31030 if (FMSub.getOpcode() != X86ISD::FMSUB)
31031 std::swap(FMAdd, FMSub);
31033 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
31034 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
31035 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
31036 FMAdd.getOperand(2) != FMSub.getOperand(2))
31037 return SDValue();
31039 // Check for correct shuffle mask.
31040 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31041 bool Op0Even;
31042 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31043 return SDValue();
31045 // FMAddSub takes zeroth operand from FMSub node.
31046 SDLoc DL(N);
31047 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
31048 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31049 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
31050 FMAdd.getOperand(2));
31051 }
31053 /// Try to combine a shuffle into a target-specific add-sub or
31054 /// mul-add-sub node.
31055 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
31056 const X86Subtarget &Subtarget,
31057 SelectionDAG &DAG) {
31058 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
31059 return V;
31061 SDValue Opnd0, Opnd1;
31062 bool IsSubAdd;
31063 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
31064 return SDValue();
31066 MVT VT = N->getSimpleValueType(0);
31067 SDLoc DL(N);
31069 // Try to generate X86ISD::FMADDSUB node here.
31070 SDValue Opnd2;
31071 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
31072 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31073 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
31074 }
31076 if (IsSubAdd)
31077 return SDValue();
31079 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
31080 // the ADDSUB idiom has been successfully recognized. There are no known
31081 // X86 targets with 512-bit ADDSUB instructions!
31082 if (VT.is512BitVector())
31083 return SDValue();
31085 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
31086 }
31088 // We are looking for a shuffle where both sources are concatenated with undef
31089 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
31090 // if we can express this as a single-source shuffle, that's preferable.
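// For example, for a v8i32 output, old mask element 8 (the first element of
// the second concat) becomes element 4 of (concat_vectors t1, t2).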
31091 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
31092 const X86Subtarget &Subtarget) {
31093 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
31094 return SDValue();
31096 EVT VT = N->getValueType(0);
31098 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
31099 if (!VT.is128BitVector() && !VT.is256BitVector())
31100 return SDValue();
31102 if (VT.getVectorElementType() != MVT::i32 &&
31103 VT.getVectorElementType() != MVT::i64 &&
31104 VT.getVectorElementType() != MVT::f32 &&
31105 VT.getVectorElementType() != MVT::f64)
31106 return SDValue();
31108 SDValue N0 = N->getOperand(0);
31109 SDValue N1 = N->getOperand(1);
31111 // Check that both sources are concats with undef.
31112 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
31113 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
31114 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
31115 !N1.getOperand(1).isUndef())
31116 return SDValue();
31118 // Construct the new shuffle mask. Elements from the first source retain their
31119 // index, but elements from the second source no longer need to skip an undef.
31120 SmallVector<int, 8> Mask;
31121 int NumElts = VT.getVectorNumElements();
31123 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31124 for (int Elt : SVOp->getMask())
31125 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
31127 SDLoc DL(N);
31128 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
31129 N1.getOperand(0));
31130 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
31131 }
31133 /// Eliminate a redundant shuffle of a horizontal math op.
31134 static SDValue foldShuffleOfHorizOp(SDNode *N) {
31135 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
31136 return SDValue();
31138 SDValue HOp = N->getOperand(0);
31139 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
31140 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
31141 return SDValue();
31143 // 128-bit horizontal math instructions are defined to operate on adjacent
31144 // lanes of each operand as:
31145 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
31146 // ...similarly for v2f64 and v8i16.
31147 // TODO: Handle UNDEF operands.
31148 if (HOp.getOperand(0) != HOp.getOperand(1))
31149 return SDValue();
31151 // When the operands of a horizontal math op are identical, the low half of
31152 // the result is the same as the high half. If the shuffle is also replicating
31153 // low and high halves, we don't need the shuffle.
31154 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
31155 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31156 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
31157 // but this should be tied to whatever horizontal op matching and shuffle
31158 // canonicalization are producing.
31159 if (HOp.getValueSizeInBits() == 128 &&
31160 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
31161 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
31162 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
31163 return HOp;
31165 if (HOp.getValueSizeInBits() == 256 &&
31166 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
31167 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
31168 isTargetShuffleEquivalent(
31169 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
31170 return HOp;
31172 return SDValue();
31173 }
31175 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
31176 TargetLowering::DAGCombinerInfo &DCI,
31177 const X86Subtarget &Subtarget) {
31178 SDLoc dl(N);
31179 EVT VT = N->getValueType(0);
31180 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31181 // If we have legalized the vector types, look for blends of FADD and FSUB
31182 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
31183 if (TLI.isTypeLegal(VT)) {
31184 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
31185 return AddSub;
31187 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
31188 return HAddSub;
31189 }
31191 // During Type Legalization, when promoting illegal vector types,
31192 // the backend might introduce new shuffle dag nodes and bitcasts.
31194 // This code performs the following transformation:
31195 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
31196 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
31198 // We do this only if both the bitcast and the BINOP dag nodes have
31199 // one use. Also, perform this transformation only if the new binary
31200 // operation is legal. This is to avoid introducing dag nodes that
31201 // potentially need to be further expanded (or custom lowered) into a
31202 // less optimal sequence of dag nodes.
31203 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
31204 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
31205 N->getOperand(0).getOpcode() == ISD::BITCAST &&
31206 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
31207 SDValue N0 = N->getOperand(0);
31208 SDValue N1 = N->getOperand(1);
31210 SDValue BC0 = N0.getOperand(0);
31211 EVT SVT = BC0.getValueType();
31212 unsigned Opcode = BC0.getOpcode();
31213 unsigned NumElts = VT.getVectorNumElements();
31215 if (BC0.hasOneUse() && SVT.isVector() &&
31216 SVT.getVectorNumElements() * 2 == NumElts &&
31217 TLI.isOperationLegal(Opcode, VT)) {
31218 bool CanFold = false;
31219 switch (Opcode) {
31220 default: break;
31221 case ISD::ADD:
31222 case ISD::SUB:
31223 case ISD::MUL:
31224 // isOperationLegal lies for integer ops on floating point types.
31225 CanFold = VT.isInteger();
31226 break;
31227 case ISD::FADD:
31228 case ISD::FSUB:
31229 case ISD::FMUL:
31230 // isOperationLegal lies for floating point ops on integer types.
31231 CanFold = VT.isFloatingPoint();
31232 break;
31233 }
31235 unsigned SVTNumElts = SVT.getVectorNumElements();
31236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31237 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
31238 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
31239 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
31240 CanFold = SVOp->getMaskElt(i) < 0;
31242 if (CanFold) {
31243 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
31244 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
31245 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
31246 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
31247 }
31248 }
31249 }
31251 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
31252 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
31253 // consecutive, non-overlapping, and in the right order.
31254 SmallVector<SDValue, 16> Elts;
31255 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
31256 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
31257 Elts.push_back(Elt);
31258 continue;
31259 }
31261 Elts.clear();
31262 break;
31263 }
31264 if (Elts.size() == VT.getVectorNumElements())
31265 if (SDValue LD =
31266 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
31267 return LD;
31269 // For AVX2, we sometimes want to combine
31270 // (vector_shuffle <mask> (concat_vectors t1, undef)
31271 // (concat_vectors t2, undef))
31272 // into:
31273 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
31274 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
31275 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
31276 return ShufConcat;
31278 if (isTargetShuffle(N->getOpcode())) {
31279 SDValue Op(N, 0);
31280 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
31281 return Shuffle;
31283 // Try recursively combining arbitrary sequences of x86 shuffle
31284 // instructions into higher-order shuffles. We do this after combining
31285 // specific PSHUF instruction sequences into their minimal form so that we
31286 // can evaluate how many specialized shuffle instructions are involved in
31287 // a particular chain.
31288 if (SDValue Res = combineX86ShufflesRecursively(
31289 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31290 /*HasVarMask*/ false, DAG, Subtarget))
31291 return Res;
31292 }
31294 return SDValue();
31295 }
31297 /// Check if a vector extract from a target-specific shuffle of a load can be
31298 /// folded into a single element load.
31299 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
31300 /// shuffles have been custom lowered so we need to handle those here.
31301 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
31302 TargetLowering::DAGCombinerInfo &DCI) {
31303 if (DCI.isBeforeLegalizeOps())
31304 return SDValue();
31306 SDValue InVec = N->getOperand(0);
31307 SDValue EltNo = N->getOperand(1);
31308 EVT EltVT = N->getValueType(0);
31310 if (!isa<ConstantSDNode>(EltNo))
31311 return SDValue();
31313 EVT OriginalVT = InVec.getValueType();
31315 // Peek through bitcasts, don't duplicate a load with other uses.
31316 InVec = peekThroughOneUseBitcasts(InVec);
31318 EVT CurrentVT = InVec.getValueType();
31319 if (!CurrentVT.isVector() ||
31320 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
31321 return SDValue();
31323 if (!isTargetShuffle(InVec.getOpcode()))
31324 return SDValue();
31326 // Don't duplicate a load with other uses.
31327 if (!InVec.hasOneUse())
31328 return SDValue();
31330 SmallVector<int, 16> ShuffleMask;
31331 SmallVector<SDValue, 2> ShuffleOps;
31332 bool UnaryShuffle;
31333 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
31334 ShuffleOps, ShuffleMask, UnaryShuffle))
31335 return SDValue();
31337 // Select the input vector, guarding against out of range extract vector.
31338 unsigned NumElems = CurrentVT.getVectorNumElements();
31339 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
31340 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
31342 if (Idx == SM_SentinelZero)
31343 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
31344 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
31345 if (Idx == SM_SentinelUndef)
31346 return DAG.getUNDEF(EltVT);
31348 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
31349 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
31350 : ShuffleOps[1];
31352 // If inputs to shuffle are the same for both ops, then allow 2 uses
31353 unsigned AllowedUses =
31354 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
31356 if (LdNode.getOpcode() == ISD::BITCAST) {
31357 // Don't duplicate a load with other uses.
31358 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
31359 return SDValue();
31361 AllowedUses = 1; // only allow 1 load use if we have a bitcast
31362 LdNode = LdNode.getOperand(0);
31365 if (!ISD::isNormalLoad(LdNode.getNode()))
31366 return SDValue();
31368 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
31370 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
31371 return SDValue();
31373 // If there's a bitcast before the shuffle, check if the load type and
31374 // alignment is valid.
31375 unsigned Align = LN0->getAlignment();
31376 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31377 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
31378 EltVT.getTypeForEVT(*DAG.getContext()));
31380 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
31381 return SDValue();
31383 // All checks match so transform back to vector_shuffle so that DAG combiner
31384 // can finish the job
31385 SDLoc dl(N);
31387 // Create shuffle node taking into account the case that it's a unary shuffle
31388 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
31389 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
31390 ShuffleMask);
31391 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
31392 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
31393 EltNo);
31394 }
31396 // Try to match patterns such as
31397 // (i16 bitcast (v16i1 x))
31398 // ->
31399 // (i16 movmsk (v16i8 sext (v16i1 x)))
31400 // before the illegal vector is scalarized on subtargets that don't have legal
31401 // vxi1 type.
31402 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
31403 const X86Subtarget &Subtarget) {
31404 EVT VT = BitCast.getValueType();
31405 SDValue N0 = BitCast.getOperand(0);
31406 EVT VecVT = N0->getValueType(0);
31408 if (!VT.isScalarInteger() || !VecVT.isSimple())
31409 return SDValue();
31411 // With AVX512 vxi1 types are legal and we prefer using k-regs.
31412 // MOVMSK is supported in SSE2 or later.
31413 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
31414 return SDValue();
31416 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
31417 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
31418 // v8i16 and v16i16.
31419 // For these two cases, we can shuffle the upper element bytes to a
31420 // consecutive sequence at the start of the vector and treat the results as
31421 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
31422 // for v16i16 this is not the case, because the shuffle is expensive, so we
31423 // avoid sign-extending to this type entirely.
31424 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
31425 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
31426 MVT SExtVT;
31427 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
31428 switch (VecVT.getSimpleVT().SimpleTy) {
31429 default:
31430 return SDValue();
31431 case MVT::v2i1:
31432 SExtVT = MVT::v2i64;
31433 FPCastVT = MVT::v2f64;
31434 break;
31435 case MVT::v4i1:
31436 SExtVT = MVT::v4i32;
31437 FPCastVT = MVT::v4f32;
31438 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
31439 // sign-extend to a 256-bit operation to avoid truncation.
31440 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31441 N0->getOperand(0).getValueType().is256BitVector()) {
31442 SExtVT = MVT::v4i64;
31443 FPCastVT = MVT::v4f64;
31444 }
31445 break;
31446 case MVT::v8i1:
31447 SExtVT = MVT::v8i16;
31448 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
31449 // sign-extend to a 256-bit operation to match the compare.
31450 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
31451 // 256-bit because the shuffle is cheaper than sign extending the result of
31452 // the compare.
31453 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31454 (N0->getOperand(0).getValueType().is256BitVector() ||
31455 N0->getOperand(0).getValueType().is512BitVector())) {
31456 SExtVT = MVT::v8i32;
31457 FPCastVT = MVT::v8f32;
31458 }
31459 break;
31460 case MVT::v16i1:
31461 SExtVT = MVT::v16i8;
31462 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
31463 // it is not profitable to sign-extend to 256-bit because this will
31464 // require an extra cross-lane shuffle which is more expensive than
31465 // truncating the result of the compare to 128-bits.
31466 break;
31467 case MVT::v32i1:
31468 SExtVT = MVT::v32i8;
31469 break;
31470 };
31472 SDLoc DL(BitCast);
31473 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
31475 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
31476 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31477 return DAG.getZExtOrTrunc(V, DL, VT);
31478 }
31480 if (SExtVT == MVT::v8i16) {
31481 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
31482 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
31483 DAG.getUNDEF(MVT::v8i16));
31484 } else
31485 assert(SExtVT.getScalarType() != MVT::i16 &&
31486 "Vectors of i16 must be packed");
31487 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
31488 V = DAG.getBitcast(FPCastVT, V);
31489 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31490 return DAG.getZExtOrTrunc(V, DL, VT);
31491 }
31493 // Convert a vXi1 constant build vector to the same width scalar integer.
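// For example, (v4i1 <1,0,1,1>) becomes (i4 0b1101), with element 0 in bit 0.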
31494 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
31495 EVT SrcVT = Op.getValueType();
31496 assert(SrcVT.getVectorElementType() == MVT::i1 &&
31497 "Expected a vXi1 vector");
31498 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
31499 "Expected a constant build vector");
31501 APInt Imm(SrcVT.getVectorNumElements(), 0);
31502 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
31503 SDValue In = Op.getOperand(Idx);
31504 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
31505 Imm.setBit(Idx);
31506 }
31507 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
31508 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
31509 }
31511 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31512 TargetLowering::DAGCombinerInfo &DCI,
31513 const X86Subtarget &Subtarget) {
31514 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
31516 if (!DCI.isBeforeLegalizeOps())
31517 return SDValue();
31519 // Only do this if we have k-registers.
31520 if (!Subtarget.hasAVX512())
31521 return SDValue();
31523 EVT DstVT = N->getValueType(0);
31524 SDValue Op = N->getOperand(0);
31525 EVT SrcVT = Op.getValueType();
31527 if (!Op.hasOneUse())
31528 return SDValue();
31530 // Look for logic ops.
31531 if (Op.getOpcode() != ISD::AND &&
31532 Op.getOpcode() != ISD::OR &&
31533 Op.getOpcode() != ISD::XOR)
31534 return SDValue();
31536 // Make sure we have a bitcast between mask registers and a scalar type.
31537 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31538 DstVT.isScalarInteger()) &&
31539 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
31540 SrcVT.isScalarInteger())
31541 return SDValue();
31543 SDValue LHS = Op.getOperand(0);
31544 SDValue RHS = Op.getOperand(1);
31546 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
31547 LHS.getOperand(0).getValueType() == DstVT)
31548 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
31549 DAG.getBitcast(DstVT, RHS));
31551 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
31552 RHS.getOperand(0).getValueType() == DstVT)
31553 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31554 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
31556 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
31557 // Most of these have to move a constant from the scalar domain anyway.
31558 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
31559 RHS = combinevXi1ConstantToInteger(RHS, DAG);
31560 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31561 DAG.getBitcast(DstVT, LHS), RHS);
31562 }
31564 return SDValue();
31565 }
31567 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
31568 const X86Subtarget &Subtarget) {
31569 SDLoc DL(N);
31570 unsigned NumElts = N.getNumOperands();
31572 auto *BV = cast<BuildVectorSDNode>(N);
31573 SDValue Splat = BV->getSplatValue();
31575 // Build MMX element from integer GPR or SSE float values.
31576 auto CreateMMXElement = [&](SDValue V) {
31577 if (V.isUndef())
31578 return DAG.getUNDEF(MVT::x86mmx);
31579 if (V.getValueType().isFloatingPoint()) {
31580 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
31581 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31582 V = DAG.getBitcast(MVT::v2i64, V);
31583 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31584 }
31585 V = DAG.getBitcast(MVT::i32, V);
31586 } else {
31587 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31588 }
31589 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31590 };
31592 // Convert build vector ops to MMX data in the bottom elements.
31593 SmallVector<SDValue, 8> Ops;
31595 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
31596 if (Splat) {
31597 if (Splat.isUndef())
31598 return DAG.getUNDEF(MVT::x86mmx);
31600 Splat = CreateMMXElement(Splat);
31602 if (Subtarget.hasSSE1()) {
31603 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
31604 if (NumElts == 8)
31605 Splat = DAG.getNode(
31606 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31607 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31608 Splat);
31610 // Use PSHUFW to repeat 16-bit elements.
31611 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
31612 return DAG.getNode(
31613 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31614 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31615 DAG.getConstant(ShufMask, DL, MVT::i8));
31616 }
31617 Ops.append(NumElts, Splat);
31618 } else {
31619 for (unsigned i = 0; i != NumElts; ++i)
31620 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31621 }
31623 // Use tree of PUNPCKLs to build up general MMX vector.
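// For example, eight i8 elements combine as: punpcklbw pairs them into four
// i16 values, punpcklwd into two i32 values, and punpckldq into the final
// 64-bit result.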
31624 while (Ops.size() > 1) {
31625 unsigned NumOps = Ops.size();
31626 unsigned IntrinOp =
31627 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31628 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31629 : Intrinsic::x86_mmx_punpcklbw));
31630 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31631 for (unsigned i = 0; i != NumOps; i += 2)
31632 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31633 Ops[i], Ops[i + 1]);
31634 Ops.resize(NumOps / 2);
31635 }
31637 return Ops[0];
31638 }
31640 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
31641 TargetLowering::DAGCombinerInfo &DCI,
31642 const X86Subtarget &Subtarget) {
31643 SDValue N0 = N->getOperand(0);
31644 EVT VT = N->getValueType(0);
31645 EVT SrcVT = N0.getValueType();
31647 // Try to match patterns such as
31648 // (i16 bitcast (v16i1 x))
31649 // ->
31650 // (i16 movmsk (v16i8 sext (v16i1 x)))
31651 // before the setcc result is scalarized on subtargets that don't have legal
31652 // vxi1 type.
31653 if (DCI.isBeforeLegalize()) {
31654 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
31655 return V;
31657 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31658 // type, widen both sides to avoid a trip through memory.
31659 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
31660 Subtarget.hasAVX512()) {
31661 SDLoc dl(N);
31662 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
31663 N0 = DAG.getBitcast(MVT::v8i1, N0);
31664 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
31665 DAG.getIntPtrConstant(0, dl));
31666 }
31668 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31669 // type, widen both sides to avoid a trip through memory.
31670 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
31671 Subtarget.hasAVX512()) {
31672 SDLoc dl(N);
31673 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
31674 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
31675 Ops[0] = N0;
31676 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
31677 N0 = DAG.getBitcast(MVT::i8, N0);
31678 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
31679 }
31680 }
31682 // Since MMX types are special and don't usually play with other vector types,
31683 // it's better to handle them early to be sure we emit efficient code by
31684 // avoiding store-load conversions.
31685 if (VT == MVT::x86mmx) {
31686 // Detect MMX constant vectors.
31687 APInt UndefElts;
31688 SmallVector<APInt, 1> EltBits;
31689 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
31690 SDLoc DL(N0);
31691 // Handle zero-extension of i32 with MOVD.
31692 if (EltBits[0].countLeadingZeros() >= 32)
31693 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
31694 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
31695 // Else, bitcast to a double.
31696 // TODO - investigate supporting sext 32-bit immediates on x86_64.
31697 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
31698 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
31699 }
31701 // Detect bitcasts to x86mmx low word.
31702 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31703 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
31704 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
31705 bool LowUndef = true, AllUndefOrZero = true;
31706 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
31707 SDValue Op = N0.getOperand(i);
31708 LowUndef &= Op.isUndef() || (i >= e/2);
31709 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
31711 if (AllUndefOrZero) {
31712 SDValue N00 = N0.getOperand(0);
31713 SDLoc dl(N00);
31714 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
31715 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
31716 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
31717 }
31718 }
31720 // Detect bitcasts of 64-bit build vectors and convert to a
31721 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
31722 // lowest element.
31723 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31724 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31725 SrcVT == MVT::v8i8))
31726 return createMMXBuildVector(N0, DAG, Subtarget);
31728 // Detect bitcasts between element or subvector extraction to x86mmx.
31729 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
31730 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
31731 isNullConstant(N0.getOperand(1))) {
31732 SDValue N00 = N0.getOperand(0);
31733 if (N00.getValueType().is128BitVector())
31734 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
31735 DAG.getBitcast(MVT::v2i64, N00));
31736 }
31738 // Detect bitcasts from FP_TO_SINT to x86mmx.
31739 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
31740 SDLoc DL(N0);
31741 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
31742 DAG.getUNDEF(MVT::v2i32));
31743 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
31744 DAG.getBitcast(MVT::v2i64, Res));
31745 }
31746 }
31748 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
31749 // most of these to scalar anyway.
31750 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
31751 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31752 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
31753 return combinevXi1ConstantToInteger(N0, DAG);
31754 }
31756 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
31757 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
31758 isa<ConstantSDNode>(N0)) {
31759 auto *C = cast<ConstantSDNode>(N0);
31760 if (C->isAllOnesValue())
31761 return DAG.getConstant(1, SDLoc(N0), VT);
31762 if (C->isNullValue())
31763 return DAG.getConstant(0, SDLoc(N0), VT);
31764 }
31766 // Try to remove bitcasts from input and output of mask arithmetic to
31767 // remove GPR<->K-register crossings.
31768 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
31769 return V;
31771 // Convert a bitcasted integer logic operation that has one bitcasted
31772 // floating-point operand into a floating-point logic operation. This may
31773 // create a load of a constant, but that is cheaper than materializing the
31774 // constant in an integer register and transferring it to an SSE register or
31775 // transferring the SSE operand to integer register and back.
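// For example, (f32 bitcast (and (i32 bitcast X), Y)) becomes
// (FAND X, (f32 bitcast Y)).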
31776 unsigned FPOpcode;
31777 switch (N0.getOpcode()) {
31778 case ISD::AND: FPOpcode = X86ISD::FAND; break;
31779 case ISD::OR: FPOpcode = X86ISD::FOR; break;
31780 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
31781 default: return SDValue();
31782 }
31784 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
31785 (Subtarget.hasSSE2() && VT == MVT::f64)))
31786 return SDValue();
31788 SDValue LogicOp0 = N0.getOperand(0);
31789 SDValue LogicOp1 = N0.getOperand(1);
31790 SDLoc DL0(N0);
31792 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
31793 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
31794 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
31795 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
31796 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
31797 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
31798 }
31799 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
31800 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
31801 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
31802 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
31803 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
31804 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
31805 }
31807 return SDValue();
31808 }
31810 // Match a binop + shuffle pyramid that represents a horizontal reduction over
31811 // the elements of a vector.
31812 // Returns the vector that is being reduced on, or SDValue() if a reduction
31813 // was not matched.
31814 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
31815 ArrayRef<ISD::NodeType> CandidateBinOps) {
31816 // The pattern must end in an extract from index 0.
31817 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
31818 !isNullConstant(Extract->getOperand(1)))
31819 return SDValue();
31821 SDValue Op = Extract->getOperand(0);
31822 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
31824 // Match against one of the candidate binary ops.
31825 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
31826 return Op.getOpcode() == unsigned(BinOp);
31827 }))
31828 return SDValue();
31830 // At each stage, we're looking for something that looks like:
31831 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
31832 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
31833 // i32 undef, i32 undef, i32 undef, i32 undef>
31834 // %a = binop <8 x i32> %op, %s
31835 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
31836 // we expect something like:
31837 // <4,5,6,7,u,u,u,u>
31838 // <2,3,u,u,u,u,u,u>
31839 // <1,u,u,u,u,u,u,u>
31840 unsigned CandidateBinOp = Op.getOpcode();
31841 for (unsigned i = 0; i < Stages; ++i) {
31842 if (Op.getOpcode() != CandidateBinOp)
31843 return SDValue();
31845 ShuffleVectorSDNode *Shuffle =
31846 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
31847 if (Shuffle) {
31848 Op = Op.getOperand(1);
31849 } else {
31850 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
31851 Op = Op.getOperand(0);
31852 }
31854 // The first operand of the shuffle should be the same as the other operand
31855 // of the binop.
31856 if (!Shuffle || Shuffle->getOperand(0) != Op)
31857 return SDValue();
31859 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
31860 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
31861 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
31862 return SDValue();
31863 }
31865 BinOp = CandidateBinOp;
31866 return Op;
31867 }
31869 // Given a select, detect the following pattern:
31870 // 1: %2 = zext <N x i8> %0 to <N x i32>
31871 // 2: %3 = zext <N x i8> %1 to <N x i32>
31872 // 3: %4 = sub nsw <N x i32> %2, %3
31873 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
31874 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
31875 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
31876 // This is useful as it is the input into a SAD pattern.
31877 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
31878 SDValue &Op1) {
31879 // Check the condition of the select instruction is greater-than.
31880 SDValue SetCC = Select->getOperand(0);
31881 if (SetCC.getOpcode() != ISD::SETCC)
31882 return false;
31883 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
31884 if (CC != ISD::SETGT && CC != ISD::SETLT)
31885 return false;
31887 SDValue SelectOp1 = Select->getOperand(1);
31888 SDValue SelectOp2 = Select->getOperand(2);
31890 // The following instructions assume SelectOp1 is the subtraction operand
31891 // and SelectOp2 is the negation operand.
31892 // In the case of SETLT this is the other way around.
31893 if (CC == ISD::SETLT)
31894 std::swap(SelectOp1, SelectOp2);
31896 // The second operand of the select should be the negation of the first
31897 // operand, which is implemented as 0 - SelectOp1.
31898 if (!(SelectOp2.getOpcode() == ISD::SUB &&
31899 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
31900 SelectOp2.getOperand(1) == SelectOp1))
31901 return false;
31903 // The first operand of SetCC is the first operand of the select, which is the
31904 // difference between the two input vectors.
31905 if (SetCC.getOperand(0) != SelectOp1)
31906 return false;
31908 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
31909 APInt SplatVal;
31910 if ((CC == ISD::SETLT) &&
31911 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
31912 SplatVal.isOneValue()) ||
31913 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
31914 return false;
31916 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
31917 if ((CC == ISD::SETGT) &&
31918 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
31919 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
31920 return false;
31922 // The first operand of the select is the difference between the two input
31923 // vectors.
31924 if (SelectOp1.getOpcode() != ISD::SUB)
31925 return false;
31927 Op0 = SelectOp1.getOperand(0);
31928 Op1 = SelectOp1.getOperand(1);
31930 // Check if the operands of the sub are zero-extended from vectors of i8.
31931 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
31932 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
31933 Op1.getOpcode() != ISD::ZERO_EXTEND ||
31934 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
31935 return false;
31937 return true;
31938 }
31940 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
31941 // to these zexts.
31942 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
31943 const SDValue &Zext1, const SDLoc &DL,
31944 const X86Subtarget &Subtarget) {
31945 // Find the appropriate width for the PSADBW.
31946 EVT InVT = Zext0.getOperand(0).getValueType();
31947 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
31949 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
31950 // fill in the missing vector elements with 0.
31951 unsigned NumConcat = RegSize / InVT.getSizeInBits();
31952 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
31953 Ops[0] = Zext0.getOperand(0);
31954 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
31955 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31956 Ops[0] = Zext1.getOperand(0);
31957 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
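// Each PSADBW sums the absolute differences of eight byte pairs into the
// low 16 bits of the corresponding 64-bit lane.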
31959 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
31960 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
31961 ArrayRef<SDValue> Ops) {
31962 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
31963 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
31965 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
31966 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
31967 PSADBWBuilder);
31968 }
31970 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
31971 // PHMINPOSUW.
31972 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
31973 const X86Subtarget &Subtarget) {
31974 // Bail without SSE41.
31975 if (!Subtarget.hasSSE41())
31976 return SDValue();
31978 EVT ExtractVT = Extract->getValueType(0);
31979 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
31980 return SDValue();
31982 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
31983 unsigned BinOp;
31984 SDValue Src = matchBinOpReduction(
31985 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
31986 if (!Src)
31987 return SDValue();
31989 EVT SrcVT = Src.getValueType();
31990 EVT SrcSVT = SrcVT.getScalarType();
31991 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
31992 return SDValue();
31994 SDLoc DL(Extract);
31995 SDValue MinPos = Src;
31997 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
31998 while (SrcVT.getSizeInBits() > 128) {
31999 unsigned NumElts = SrcVT.getVectorNumElements();
32000 unsigned NumSubElts = NumElts / 2;
32001 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
32002 unsigned SubSizeInBits = SrcVT.getSizeInBits();
32003 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
32004 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
32005 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
32006 }
32007 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
32008 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
32009 "Unexpected value type");
32011 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
32012 // to flip the value accordingly.
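// For example, for SMAX an XOR with 0x7FFF maps the signed maximum to the
// unsigned minimum, which PHMINPOSUW can then locate.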
32013 SDValue Mask;
32014 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
32015 if (BinOp == ISD::SMAX)
32016 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
32017 else if (BinOp == ISD::SMIN)
32018 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
32019 else if (BinOp == ISD::UMAX)
32020 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
32022 if (Mask)
32023 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
32025 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
32026 // shuffling each upper element down and insert zeros. This means that the
32027 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
32028 // ready for the PHMINPOS.
32029 if (ExtractVT == MVT::i8) {
32030 SDValue Upper = DAG.getVectorShuffle(
32031 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
32032 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
32033 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
32034 }
32036 // Perform the PHMINPOS on a v8i16 vector.
32037 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
32038 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
32039 MinPos = DAG.getBitcast(SrcVT, MinPos);
32041 if (Mask)
32042 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
32044 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
32045 DAG.getIntPtrConstant(0, DL));
32046 }
32048 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
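// An illustrative sketch: with a pcmpgtd-style compare %c whose i32 lanes are
// all-ones or all-zeros, the any_of reduction
//   (%c[0] | %c[1] | %c[2] | %c[3]) != 0
// becomes (MOVMSKPS %c) != 0, and the all_of form
//   (%c[0] & %c[1] & %c[2] & %c[3]) != 0
// becomes (MOVMSKPS %c) == 0xF, i.e. (1 << NumElts) - 1.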
32049 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
32050 SelectionDAG &DAG,
32051 const X86Subtarget &Subtarget) {
32052 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
32053 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
32054 return SDValue();
32056 EVT ExtractVT = Extract->getValueType(0);
32057 unsigned BitWidth = ExtractVT.getSizeInBits();
32058 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
32059 ExtractVT != MVT::i8)
32060 return SDValue();
32062 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
32063 unsigned BinOp = 0;
32064 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
32065 if (!Match)
32066 return SDValue();
32068 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
32069 // which we can't support here for now.
32070 if (Match.getScalarValueSizeInBits() != BitWidth)
32071 return SDValue();
32073 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
32074 unsigned MatchSizeInBits = Match.getValueSizeInBits();
32075 if (!(MatchSizeInBits == 128 ||
32076 (MatchSizeInBits == 256 &&
32077 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
32078 return SDValue();
32080 // Don't bother performing this for 2-element vectors.
32081 if (Match.getValueType().getVectorNumElements() <= 2)
32082 return SDValue();
32084 // Check that we are extracting a reduction of all sign bits.
32085 if (DAG.ComputeNumSignBits(Match) != BitWidth)
32086 return SDValue();
32088 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
32089 MVT MaskVT;
32090 if (64 == BitWidth || 32 == BitWidth)
32091 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
32092 MatchSizeInBits / BitWidth);
32094 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
32096 APInt CompareBits;
32097 ISD::CondCode CondCode;
32098 if (BinOp == ISD::OR) {
32099 // any_of -> MOVMSK != 0
32100 CompareBits = APInt::getNullValue(32);
32101 CondCode = ISD::CondCode::SETNE;
32102 } else {
32103 // all_of -> MOVMSK == ((1 << NumElts) - 1)
32104 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
32105 CondCode = ISD::CondCode::SETEQ;
32106 }
32108 // Perform the select as i32/i64 and then truncate to avoid partial register
32109 // stalls.
32110 unsigned ResWidth = std::max(BitWidth, 32u);
32111 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
32113 SDValue Zero = DAG.getConstant(0, DL, ResVT);
32114 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
32115 SDValue Res = DAG.getBitcast(MaskVT, Match);
32116 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
32117 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
32118 Ones, Zero, CondCode);
32119 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
32122 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
32123 const X86Subtarget &Subtarget) {
32124 // PSADBW is only supported on SSE2 and up.
32125 if (!Subtarget.hasSSE2())
32126 return SDValue();
32128 // Verify the type we're extracting from is an integer type wider than i16.
32129 EVT VT = Extract->getOperand(0).getValueType();
32130 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
32131 return SDValue();
32133 unsigned RegSize = 128;
32134 if (Subtarget.useBWIRegs())
32135 RegSize = 512;
32136 else if (Subtarget.hasAVX())
32137 RegSize = 256;
32139 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
32140 // TODO: We should be able to handle larger vectors by splitting them before
32141 // feeding them into several SADs, and then reducing over those.
32142 if (RegSize / VT.getVectorNumElements() < 8)
32143 return SDValue();
32145 // Match shuffle + add pyramid.
32146 unsigned BinOp = 0;
32147 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
32149 // The operand is expected to be zero extended from i8
32150 // (verified in detectZextAbsDiff).
32151 // In order to convert to i64 and above, an additional any/zero/sign
32152 // extend is expected.
32153 // The zero extend from 32 bits has no mathematical effect on the result.
32154 // Also the sign extend is basically a zero extend
32155 // (it extends the sign bit, which is zero).
32156 // So it is correct to skip the sign/zero extend instruction.
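// As an illustrative IR sketch (value names hypothetical), the pattern rooted
// here looks roughly like:
//   %d   = sub <16 x i32> %a, %b        ; %a, %b zero-extended from i8
//   %n   = sub <16 x i32> zeroinitializer, %d
//   %cmp = icmp sgt <16 x i32> %d, <i32 -1, ...>
//   %abs = select <16 x i1> %cmp, <16 x i32> %d, <16 x i32> %n
// followed by a shuffle+add reduction pyramid and a final extractelement.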
32157 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
32158 Root.getOpcode() == ISD::ZERO_EXTEND ||
32159 Root.getOpcode() == ISD::ANY_EXTEND))
32160 Root = Root.getOperand(0);
32162 // If there was a match, we want Root to be a select that is the root of an
32163 // abs-diff pattern.
32164 if (!Root || (Root.getOpcode() != ISD::VSELECT))
32165 return SDValue();
32167 // Check whether we have an abs-diff pattern feeding into the select.
32168 SDValue Zext0, Zext1;
32169 if (!detectZextAbsDiff(Root, Zext0, Zext1))
32170 return SDValue();
32172 // Create the SAD instruction.
32173 SDLoc DL(Extract);
32174 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
32176 // If the original vector was wider than 8 elements, sum over the results
32177 // in the SAD vector.
32178 unsigned Stages = Log2_32(VT.getVectorNumElements());
32179 MVT SadVT = SAD.getSimpleValueType();
32180 if (Stages > 3) {
32181 unsigned SadElems = SadVT.getVectorNumElements();
32183 for(unsigned i = Stages - 3; i > 0; --i) {
32184 SmallVector<int, 16> Mask(SadElems, -1);
32185 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
32186 Mask[j] = MaskEnd + j;
32188 SDValue Shuffle =
32189 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
32190 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
32191 }
32192 }
32194 MVT Type = Extract->getSimpleValueType(0);
32195 unsigned TypeSizeInBits = Type.getSizeInBits();
32196 // Return the lowest TypeSizeInBits bits.
32197 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
32198 SAD = DAG.getBitcast(ResVT, SAD);
32199 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
32200 Extract->getOperand(1));
32201 }
32203 // Attempt to peek through a target shuffle and extract the scalar from the
32204 // source.
32205 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
32206 TargetLowering::DAGCombinerInfo &DCI,
32207 const X86Subtarget &Subtarget) {
32208 if (DCI.isBeforeLegalizeOps())
32209 return SDValue();
32211 SDValue Src = N->getOperand(0);
32212 SDValue Idx = N->getOperand(1);
32214 EVT VT = N->getValueType(0);
32215 EVT SrcVT = Src.getValueType();
32216 EVT SrcSVT = SrcVT.getVectorElementType();
32217 unsigned NumSrcElts = SrcVT.getVectorNumElements();
32219 // Don't attempt this for boolean mask vectors or unknown extraction indices.
32220 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
32221 return SDValue();
32223 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
32224 if (X86ISD::VBROADCAST == Src.getOpcode() &&
32225 Src.getOperand(0).getValueType() == VT)
32226 return Src.getOperand(0);
32228 // Resolve the target shuffle inputs and mask.
32229 SmallVector<int, 16> Mask;
32230 SmallVector<SDValue, 2> Ops;
32231 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
32232 return SDValue();
32234 // Attempt to narrow/widen the shuffle mask to the correct size.
32235 if (Mask.size() != NumSrcElts) {
32236 if ((NumSrcElts % Mask.size()) == 0) {
32237 SmallVector<int, 16> ScaledMask;
32238 int Scale = NumSrcElts / Mask.size();
32239 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
32240 Mask = std::move(ScaledMask);
32241 } else if ((Mask.size() % NumSrcElts) == 0) {
32242 SmallVector<int, 16> WidenedMask;
32243 while (Mask.size() > NumSrcElts &&
32244 canWidenShuffleElements(Mask, WidenedMask))
32245 Mask = std::move(WidenedMask);
32246 // TODO - investigate support for wider shuffle masks with known upper
32247 // undef/zero elements for implicit zero-extension.
32248 }
32249 }
32251 // Check if narrowing/widening failed.
32252 if (Mask.size() != NumSrcElts)
32253 return SDValue();
32255 int SrcIdx = Mask[N->getConstantOperandVal(1)];
32256 SDLoc dl(N);
32258 // If the shuffle source element is undef/zero then we can just accept it.
32259 if (SrcIdx == SM_SentinelUndef)
32260 return DAG.getUNDEF(VT);
32262 if (SrcIdx == SM_SentinelZero)
32263 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
32264 : DAG.getConstant(0, dl, VT);
32266 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
32267 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
32268 SrcIdx = SrcIdx % Mask.size();
32270 // We can only extract other elements from 128-bit vectors and in certain
32271 // circumstances, depending on SSE-level.
32272 // TODO: Investigate using extract_subvector for larger vectors.
32273 // TODO: Investigate float/double extraction if it will be just stored.
32274 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
32275 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
32276 assert(SrcSVT == VT && "Unexpected extraction type");
32277 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
32278 DAG.getIntPtrConstant(SrcIdx, dl));
32279 }
32281 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
32282 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
32283 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
32284 "Unexpected extraction type");
32285 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
32286 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
32287 DAG.getIntPtrConstant(SrcIdx, dl));
32288 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
32289 }
32291 return SDValue();
32292 }
32294 /// Detect vector gather/scatter index generation and convert it from being a
32295 /// bunch of shuffles and extracts into a somewhat faster sequence.
32296 /// For i686, the best sequence is apparently storing the value and loading
32297 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
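/// An illustrative sketch of the x64 form (value names hypothetical):
///   %w  = extractelement <2 x i64> %v, i64 0   ; one 64-bit extract
///   %i0 = trunc i64 %w to i32
///   %hi = lshr i64 %w, 32
///   %i1 = trunc i64 %hi to i32                 ; shift replaces a 2nd extract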
32298 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
32299 TargetLowering::DAGCombinerInfo &DCI,
32300 const X86Subtarget &Subtarget) {
32301 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
32302 return NewOp;
32304 // TODO - Remove this once we can handle the implicit zero-extension of
32305 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
32306 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
32307 // combineBasicSADPattern.
32308 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
32309 return SDValue();
32311 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
32312 return NewOp;
32314 SDValue InputVector = N->getOperand(0);
32315 SDValue EltIdx = N->getOperand(1);
32317 EVT SrcVT = InputVector.getValueType();
32318 EVT VT = N->getValueType(0);
32319 SDLoc dl(InputVector);
32321 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
32322 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32323 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
32324 SDValue MMXSrc = InputVector.getOperand(0);
32326 // The bitcast source is a direct mmx result.
32327 if (MMXSrc.getValueType() == MVT::x86mmx)
32328 return DAG.getBitcast(VT, InputVector);
32329 }
32331 // Detect mmx to i32 conversion through a v2i32 elt extract.
32332 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32333 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
32334 SDValue MMXSrc = InputVector.getOperand(0);
32336 // The bitcast source is a direct mmx result.
32337 if (MMXSrc.getValueType() == MVT::x86mmx)
32338 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
32339 }
32341 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
32342 isa<ConstantSDNode>(EltIdx) &&
32343 isa<ConstantSDNode>(InputVector.getOperand(0))) {
32344 uint64_t ExtractedElt = N->getConstantOperandVal(1);
32345 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
32346 const APInt &InputValue = InputC->getAPIntValue();
32347 uint64_t Res = InputValue[ExtractedElt];
32348 return DAG.getConstant(Res, dl, MVT::i1);
32349 }
32351 // Check whether this extract is the root of a sum of absolute differences
32352 // pattern. This has to be done here because we really want it to happen
32353 // pre-legalization.
32354 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
32355 return SAD;
32357 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
32358 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
32359 return Cmp;
32361 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
32362 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
32363 return MinMax;
32365 return SDValue();
32366 }
32368 /// If a vector select has an operand that is -1 or 0, try to simplify the
32369 /// select to a bitwise logic operation.
32370 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
32371 static SDValue
32372 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
32373 TargetLowering::DAGCombinerInfo &DCI,
32374 const X86Subtarget &Subtarget) {
32375 SDValue Cond = N->getOperand(0);
32376 SDValue LHS = N->getOperand(1);
32377 SDValue RHS = N->getOperand(2);
32378 EVT VT = LHS.getValueType();
32379 EVT CondVT = Cond.getValueType();
32380 SDLoc DL(N);
32381 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32383 if (N->getOpcode() != ISD::VSELECT)
32384 return SDValue();
32386 assert(CondVT.isVector() && "Vector select expects a vector selector!");
32388 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
32389 // Check if the first operand is all zeros and Cond type is vXi1.
32390 // This situation only applies to avx512.
32391 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
32392 CondVT.getVectorElementType() == MVT::i1) {
32393 // Invert the cond to not(cond) : xor(op,allones)=not(op)
32394 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
32395 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
32396 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
32397 }
32399 // To use the condition operand as a bitwise mask, it must have elements that
32400 // are the same size as the select elements. I.e., the condition operand must
32401 // have already been promoted from the IR select condition type <N x i1>.
32402 // Don't check if the types themselves are equal because that excludes
32403 // vector floating-point selects.
32404 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
32405 return SDValue();
32407 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
32408 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
32410 // Try to invert the condition if true value is not all 1s and false value is
32411 // not all 0s.
32412 if (!TValIsAllOnes && !FValIsAllZeros &&
32413 // Check if the selector will be produced by CMPP*/PCMP*.
32414 Cond.getOpcode() == ISD::SETCC &&
32415 // Check if SETCC has already been promoted.
32416 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
32417 CondVT) {
32418 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
32420 if (TValIsAllZeros || FValIsAllOnes) {
32421 SDValue CC = Cond.getOperand(2);
32422 ISD::CondCode NewCC =
32423 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
32424 Cond.getOperand(0).getValueType().isInteger());
32425 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
32426 NewCC);
32427 std::swap(LHS, RHS);
32428 TValIsAllOnes = FValIsAllOnes;
32429 FValIsAllZeros = TValIsAllZeros;
32430 }
32431 }
32433 // Cond value must be 'sign splat' to be converted to a logical op.
32434 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
32435 return SDValue();
32437 // vselect Cond, 111..., 000... -> Cond
32438 if (TValIsAllOnes && FValIsAllZeros)
32439 return DAG.getBitcast(VT, Cond);
32441 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
32442 return SDValue();
32444 // vselect Cond, 111..., X -> or Cond, X
32445 if (TValIsAllOnes) {
32446 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
32447 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
32448 return DAG.getBitcast(VT, Or);
32449 }
32451 // vselect Cond, X, 000... -> and Cond, X
32452 if (FValIsAllZeros) {
32453 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
32454 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
32455 return DAG.getBitcast(VT, And);
32456 }
32458 // vselect Cond, 000..., X -> andn Cond, X
32459 if (TValIsAllZeros) {
32460 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
32461 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
32462 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
32463 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
32464 return DAG.getBitcast(VT, AndN);
32465 }
32467 return SDValue();
32468 }
32470 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
32471 SDValue Cond = N->getOperand(0);
32472 SDValue LHS = N->getOperand(1);
32473 SDValue RHS = N->getOperand(2);
32476 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
32477 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
32478 if (!TrueC || !FalseC)
32479 return SDValue();
32481 // Don't do this for crazy integer types.
32482 EVT VT = N->getValueType(0);
32483 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32484 return SDValue();
32486 // We're going to use the condition bit in math or logic ops. We could allow
32487 // this with a wider condition value (post-legalization it becomes an i8),
32488 // but if nothing is creating selects that late, it doesn't matter.
32489 if (Cond.getValueType() != MVT::i1)
32490 return SDValue();
32492 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
32493 // 3, 5, or 9 with i32/i64, so those get transformed too.
32494 // TODO: For constants that overflow or do not differ by power-of-2 or small
32495 // multiplier, convert to 'and' + 'add'.
32496 const APInt &TrueVal = TrueC->getAPIntValue();
32497 const APInt &FalseVal = FalseC->getAPIntValue();
32498 bool OV;
32499 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
32500 if (OV)
32501 return SDValue();
32503 APInt AbsDiff = Diff.abs();
32504 if (AbsDiff.isPowerOf2() ||
32505 ((VT == MVT::i32 || VT == MVT::i64) &&
32506 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
32508 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
32509 // of the condition can usually be folded into a compare predicate, but even
32510 // without that, the sequence should be cheaper than a CMOV alternative.
32511 if (TrueVal.slt(FalseVal)) {
32512 Cond = DAG.getNOT(DL, Cond, MVT::i1);
32513 std::swap(TrueC, FalseC);
32514 }
32516 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
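// An illustrative instance: 'select Cond, 9, 1' becomes (zext(Cond) << 3) + 1,
// since the difference 8 is a power of 2: Cond == 1 gives 8 + 1 == 9 and
// Cond == 0 gives 0 + 1 == 1.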
32517 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
32519 // Multiply condition by the difference if non-one.
32520 if (!AbsDiff.isOneValue())
32521 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
32523 // Add the base if non-zero.
32524 if (!FalseC->isNullValue())
32525 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
32527 return R;
32528 }
32530 return SDValue();
32531 }
32533 /// If this is a *dynamic* select (non-constant condition) and we can match
32534 /// this node with one of the variable blend instructions, restructure the
32535 /// condition so that blends can use the high (sign) bit of each element.
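/// An illustrative sketch: on SSE4.1, BLENDVPS reads only the sign bit of each
/// i32 condition element, so just the top bit of each lane of Cond is demanded
/// here, and SimplifyDemandedBits may strip whatever work produced the
/// remaining 31 bits.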
32536 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
32537 TargetLowering::DAGCombinerInfo &DCI,
32538 const X86Subtarget &Subtarget) {
32539 SDValue Cond = N->getOperand(0);
32540 if (N->getOpcode() != ISD::VSELECT ||
32541 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
32542 return SDValue();
32544 // Don't optimize before the condition has been transformed to a legal type
32545 // and don't ever optimize vector selects that map to AVX512 mask-registers.
32546 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32547 if (BitWidth < 8 || BitWidth > 64)
32548 return SDValue();
32550 // We can only handle the cases where VSELECT is directly legal on the
32551 // subtarget. We custom lower VSELECT nodes with constant conditions and
32552 // this makes it hard to see whether a dynamic VSELECT will correctly
32553 // lower, so we both check the operation's status and explicitly handle the
32554 // cases where a *dynamic* blend will fail even though a constant-condition
32555 // blend could be custom lowered.
32556 // FIXME: We should find a better way to handle this class of problems.
32557 // Potentially, we should combine constant-condition vselect nodes
32558 // pre-legalization into shuffles and not mark as many types as custom
32559 // lowered.
32560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32561 EVT VT = N->getValueType(0);
32562 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32563 return SDValue();
32564 // FIXME: We don't support i16-element blends currently. We could and
32565 // should support them by making *all* the bits in the condition be set
32566 // rather than just the high bit and using an i8-element blend.
32567 if (VT.getVectorElementType() == MVT::i16)
32568 return SDValue();
32569 // Dynamic blending was only available from SSE4.1 onward.
32570 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32571 return SDValue();
32572 // Byte blends are only available in AVX2
32573 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32574 return SDValue();
32575 // There are no 512-bit blend instructions that use sign bits.
32576 if (VT.is512BitVector())
32577 return SDValue();
32579 // TODO: Add other opcodes eventually lowered into BLEND.
32580 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32581 UI != UE; ++UI)
32582 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
32583 return SDValue();
32585 APInt DemandedMask(APInt::getSignMask(BitWidth));
32586 KnownBits Known;
32587 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32588 !DCI.isBeforeLegalizeOps());
32589 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
32590 return SDValue();
32592 // If we changed the computation somewhere in the DAG, this change will
32593 // affect all users of Cond. Update all the nodes so that we do not use
32594 // the generic VSELECT anymore. Otherwise, we may perform wrong
32595 // optimizations as we messed with the actual expectation for the vector
32596 // boolean values.
32597 for (SDNode *U : Cond->uses()) {
32598 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
32599 Cond, U->getOperand(1), U->getOperand(2));
32600 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32601 }
32602 DCI.CommitTargetLoweringOpt(TLO);
32603 return SDValue(N, 0);
32604 }
32606 /// Do target-specific dag combines on SELECT and VSELECT nodes.
32607 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
32608 TargetLowering::DAGCombinerInfo &DCI,
32609 const X86Subtarget &Subtarget) {
32610 SDLoc DL(N);
32611 SDValue Cond = N->getOperand(0);
32612 // Get the LHS/RHS of the select.
32613 SDValue LHS = N->getOperand(1);
32614 SDValue RHS = N->getOperand(2);
32615 EVT VT = LHS.getValueType();
32616 EVT CondVT = Cond.getValueType();
32617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32619 // Convert vselects with constant condition into shuffles.
32620 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
32621 DCI.isBeforeLegalizeOps()) {
32622 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
32623 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
32624 SDValue CondElt = Cond->getOperand(i);
32625 Mask[i] = i;
32626 // Arbitrarily choose from the 2nd operand if the select condition element
32627 // is undef.
32628 // TODO: Can we do better by matching patterns such as even/odd?
32629 if (CondElt.isUndef() || isNullConstant(CondElt))
32630 Mask[i] += Size;
32631 }
32633 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
32634 }
32636 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
32637 // instructions match the semantics of the common C idiom x<y?x:y but not
32638 // x<=y?x:y, because of how they handle negative zero (which can be
32639 // ignored in unsafe-math mode).
32640 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
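// An illustrative sketch of why only the strict form matches: in AT&T syntax
// 'minps %xmm1, %xmm0' computes
//   xmm0[i] = xmm0[i] < xmm1[i] ? xmm0[i] : xmm1[i]
// returning the second operand when an input is NaN and treating -0.0 as equal
// to +0.0, which matches 'x < y ? x : y' but not 'x <= y ? x : y'.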
32641 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
32642 VT != MVT::f80 && VT != MVT::f128 &&
32643 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
32644 (Subtarget.hasSSE2() ||
32645 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
32646 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32648 unsigned Opcode = 0;
32649 // Check for x CC y ? x : y.
32650 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32651 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32652 switch (CC) {
32653 default: break;
32654 case ISD::SETULT:
32655 // Converting this to a min would handle NaNs incorrectly, and swapping
32656 // the operands would cause it to handle comparisons between positive
32657 // and negative zero incorrectly.
32658 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32659 if (!DAG.getTarget().Options.UnsafeFPMath &&
32660 !(DAG.isKnownNeverZeroFloat(LHS) ||
32661 DAG.isKnownNeverZeroFloat(RHS)))
32663 std::swap(LHS, RHS);
32665 Opcode = X86ISD::FMIN;
32668 // Converting this to a min would handle comparisons between positive
32669 // and negative zero incorrectly.
32670 if (!DAG.getTarget().Options.UnsafeFPMath &&
32671 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32673 Opcode = X86ISD::FMIN;
32676 // Converting this to a min would handle both negative zeros and NaNs
32677 // incorrectly, but we can swap the operands to fix both.
32678 std::swap(LHS, RHS);
32683 Opcode = X86ISD::FMIN;
32687 // Converting this to a max would handle comparisons between positive
32688 // and negative zero incorrectly.
32689 if (!DAG.getTarget().Options.UnsafeFPMath &&
32690 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32692 Opcode = X86ISD::FMAX;
32695 // Converting this to a max would handle NaNs incorrectly, and swapping
32696 // the operands would cause it to handle comparisons between positive
32697 // and negative zero incorrectly.
32698 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32699 if (!DAG.getTarget().Options.UnsafeFPMath &&
32700 !(DAG.isKnownNeverZeroFloat(LHS) ||
32701 DAG.isKnownNeverZeroFloat(RHS)))
32703 std::swap(LHS, RHS);
32705 Opcode = X86ISD::FMAX;
32708 // Converting this to a max would handle both negative zeros and NaNs
32709 // incorrectly, but we can swap the operands to fix both.
32710 std::swap(LHS, RHS);
32715 Opcode = X86ISD::FMAX;
32718 // Check for x CC y ? y : x -- a min/max with reversed arms.
32719 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32720 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32721 switch (CC) {
32722 default: break;
32723 case ISD::SETOGE:
32724 // Converting this to a min would handle comparisons between positive
32725 // and negative zero incorrectly, and swapping the operands would
32726 // cause it to handle NaNs incorrectly.
32727 if (!DAG.getTarget().Options.UnsafeFPMath &&
32728 !(DAG.isKnownNeverZeroFloat(LHS) ||
32729 DAG.isKnownNeverZeroFloat(RHS))) {
32730 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32732 std::swap(LHS, RHS);
32734 Opcode = X86ISD::FMIN;
32737 // Converting this to a min would handle NaNs incorrectly.
32738 if (!DAG.getTarget().Options.UnsafeFPMath &&
32739 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32741 Opcode = X86ISD::FMIN;
32744 // Converting this to a min would handle both negative zeros and NaNs
32745 // incorrectly, but we can swap the operands to fix both.
32746 std::swap(LHS, RHS);
32751 Opcode = X86ISD::FMIN;
32755 // Converting this to a max would handle NaNs incorrectly.
32756 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32758 Opcode = X86ISD::FMAX;
32761 // Converting this to a max would handle comparisons between positive
32762 // and negative zero incorrectly, and swapping the operands would
32763 // cause it to handle NaNs incorrectly.
32764 if (!DAG.getTarget().Options.UnsafeFPMath &&
32765 !DAG.isKnownNeverZeroFloat(LHS) &&
32766 !DAG.isKnownNeverZeroFloat(RHS)) {
32767 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32769 std::swap(LHS, RHS);
32771 Opcode = X86ISD::FMAX;
32774 // Converting this to a max would handle both negative zeros and NaNs
32775 // incorrectly, but we can swap the operands to fix both.
32776 std::swap(LHS, RHS);
32781 Opcode = X86ISD::FMAX;
32782 break;
32783 }
32784 }
32786 if (Opcode)
32787 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
32788 }
32790 // Some mask scalar intrinsics rely on checking if only one bit is set
32791 // and implement it in C code like this:
32792 // A[0] = (U & 1) ? A[0] : W[0];
32793 // This creates some redundant instructions that break pattern matching.
32794 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
32795 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
32796 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
32797 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32798 SDValue AndNode = Cond.getOperand(0);
32799 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
32800 isNullConstant(Cond.getOperand(1)) &&
32801 isOneConstant(AndNode.getOperand(1))) {
32802 // LHS and RHS swapped due to
32803 // setcc outputting 1 when AND resulted in 0 and vice versa.
32804 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
32805 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
32806 }
32807 }
32809 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32810 // lowering on KNL. In this case we convert it to
32811 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
32812 // The same situation applies to all vectors of i8 and i16 without BWI.
32813 // Make sure we extend these even before type legalization gets a chance to
32814 // split wide vectors.
32815 // From SKX onward these selects have a proper lowering.
32816 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32817 CondVT.getVectorElementType() == MVT::i1 &&
32818 VT.getVectorNumElements() > 4 &&
32819 (VT.getVectorElementType() == MVT::i8 ||
32820 VT.getVectorElementType() == MVT::i16)) {
32821 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32822 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32825 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32826 return V;
32828 // Canonicalize max and min:
32829 // (x > y) ? x : y -> (x >= y) ? x : y
32830 // (x < y) ? x : y -> (x <= y) ? x : y
32831 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32832 // the need for an extra compare
32833 // against zero. e.g.
32834 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
32836 //   testl %edi, %edi
32837 //   movl $0, %eax
32838 //   cmovgl %edi, %eax
32839 // =>
32840 //   xorl %eax, %eax
32841 //   testl %edi, %edi
32842 //   cmovsl %eax, %edi
32843 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32844 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32845 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32846 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32847 switch (CC) {
32848 default: break;
32849 case ISD::SETLT:
32850 case ISD::SETGT: {
32851 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32852 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32853 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32854 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32855 }
32856 }
32857 }
32859 // Early exit check
32860 if (!TLI.isTypeLegal(VT))
32861 return SDValue();
32863 // Match VSELECTs into subs with unsigned saturation.
32864 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32865 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32866 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32867 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32868 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32870 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32871 // left side invert the predicate to simplify logic below.
32872 SDValue Other;
32873 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32874 Other = RHS;
32875 CC = ISD::getSetCCInverse(CC, true);
32876 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
32877 Other = LHS;
32878 }
32880 if (Other.getNode() && Other->getNumOperands() == 2 &&
32881 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32882 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32883 SDValue CondRHS = Cond->getOperand(1);
32885 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32886 ArrayRef<SDValue> Ops) {
32887 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32890 // Look for a general sub with unsigned saturation first.
32891 // x >= y ? x-y : 0 --> subus x, y
32892 // x > y ? x-y : 0 --> subus x, y
32893 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32894 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32895 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32896 SUBUSBuilder);
32898 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32899 if (isa<BuildVectorSDNode>(CondRHS)) {
32900 // If the RHS is a constant we have to reverse the const
32901 // canonicalization.
32902 // x > C-1 ? x+-C : 0 --> subus x, C
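// An illustrative instance with i8 lanes: 'x >u 191 ? x + 64 : 0' is really
// 'x >u 191 ? x - 192 : 0', since +64 == -192 (mod 256), and so it can use
// 'subus x, 192'.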
32903 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32904 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32905 };
32906 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32907 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32908 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32909 DAG.getConstant(0, DL, VT), OpRHS);
32910 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32911 SUBUSBuilder);
32912 }
32914 // Another special case: If C was a sign bit, the sub has been
32915 // canonicalized into a xor.
32916 // FIXME: Would it be better to use computeKnownBits to determine
32917 // whether it's safe to decanonicalize the xor?
32918 // x s< 0 ? x^C : 0 --> subus x, C
32919 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32920 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32921 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32922 OpRHSConst->getAPIntValue().isSignMask()) {
32923 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32924 // Note that we have to rebuild the RHS constant here to ensure we
32925 // don't rely on particular values of undef lanes.
32926 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32927 SUBUSBuilder);
32928 }
32929 }
32930 }
32931 }
32933 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32934 return V;
32936 if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
32937 return V;
32939 // Custom action for SELECT MMX
32940 if (VT == MVT::x86mmx) {
32941 LHS = DAG.getBitcast(MVT::i64, LHS);
32942 RHS = DAG.getBitcast(MVT::i64, RHS);
32943 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32944 return DAG.getBitcast(VT, newSelect);
32945 }
32947 return SDValue();
32948 }
32950 /// Combine:
32951 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
32952 /// to:
32953 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
32954 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32955 /// Note that this is only legal for some op/cc combinations.
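/// An illustrative sketch: 'if ((int)atomic-increment(&x) < 0)' can become
///   lock incl (%rdi) ; jle <target>
/// instead of materializing the old value:
///   movl $1, %eax ; lock xaddl %eax, (%rdi) ; testl %eax, %eax ; js <target>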
32956 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32957 SelectionDAG &DAG,
32958 const X86Subtarget &Subtarget) {
32959 // This combine only operates on CMP-like nodes.
32960 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32961 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32962 return SDValue();
32964 // Can't replace the cmp if it has more uses than the one we're looking at.
32965 // FIXME: We would like to be able to handle this, but would need to make sure
32966 // all uses were updated.
32967 if (!Cmp.hasOneUse())
32968 return SDValue();
32970 // This only applies to variations of the common case:
32971 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32972 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32973 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32974 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32975 // Using the proper condcodes (see below), overflow is checked for.
32977 // FIXME: We can generalize both constraints:
32978 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32979 // - LHS != 1
32980 // if the result is compared.
32982 SDValue CmpLHS = Cmp.getOperand(0);
32983 SDValue CmpRHS = Cmp.getOperand(1);
32985 if (!CmpLHS.hasOneUse())
32986 return SDValue();
32988 unsigned Opc = CmpLHS.getOpcode();
32989 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32990 return SDValue();
32992 SDValue OpRHS = CmpLHS.getOperand(2);
32993 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
32994 if (!OpRHSC)
32995 return SDValue();
32997 APInt Addend = OpRHSC->getAPIntValue();
32998 if (Opc == ISD::ATOMIC_LOAD_SUB)
32999 Addend = -Addend;
33001 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
33002 if (!CmpRHSC)
33003 return SDValue();
33005 APInt Comparison = CmpRHSC->getAPIntValue();
33007 // If the addend is the negation of the comparison value, then we can do
33008 // a full comparison by emitting the atomic arithmetic as a locked sub.
33009 if (Comparison == -Addend) {
33010 // The CC is fine, but we need to rewrite the LHS of the comparison as an
33011 // atomic sub.
33012 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
33013 auto AtomicSub = DAG.getAtomic(
33014 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
33015 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
33016 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
33017 AN->getMemOperand());
33018 // If the comparison uses the CF flag we can't use INC/DEC instructions.
33019 bool NeedCF = false;
33020 switch (CC) {
33021 default: break;
33022 case X86::COND_A: case X86::COND_AE:
33023 case X86::COND_B: case X86::COND_BE:
33024 NeedCF = true;
33025 break;
33026 }
33027 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
33028 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33029 DAG.getUNDEF(CmpLHS.getValueType()));
33030 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33031 return LockOp;
33032 }
33034 // We can handle comparisons with zero in a number of cases by manipulating
33035 // the CC used.
33036 if (!Comparison.isNullValue())
33037 return SDValue();
33039 if (CC == X86::COND_S && Addend == 1)
33040 CC = X86::COND_LE;
33041 else if (CC == X86::COND_NS && Addend == 1)
33042 CC = X86::COND_GT;
33043 else if (CC == X86::COND_G && Addend == -1)
33044 CC = X86::COND_GE;
33045 else if (CC == X86::COND_LE && Addend == -1)
33046 CC = X86::COND_LT;
33047 else
33048 return SDValue();
33050 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
33051 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33052 DAG.getUNDEF(CmpLHS.getValueType()));
33053 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33054 return LockOp;
33055 }
33057 // Check whether a boolean test is testing a boolean value generated by
33058 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
33059 // flag.
33061 // Simplify the following patterns:
33062 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
33063 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
33064 // to (Op EFLAGS Cond)
33066 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
33067 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
33068 // to (Op EFLAGS !Cond)
33070 // where Op could be BRCOND or CMOV.
33072 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
33073 // This combine only operates on CMP-like nodes.
33074 if (!(Cmp.getOpcode() == X86ISD::CMP ||
33075 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
33076 return SDValue();
33078 // Quit if not used as a boolean value.
33079 if (CC != X86::COND_E && CC != X86::COND_NE)
33080 return SDValue();
33082 // Check CMP operands. One of them should be 0 or 1 and the other should be
33083 // an SetCC or extended from it.
33084 SDValue Op1 = Cmp.getOperand(0);
33085 SDValue Op2 = Cmp.getOperand(1);
33087 SDValue SetCC;
33088 const ConstantSDNode* C = nullptr;
33089 bool needOppositeCond = (CC == X86::COND_E);
33090 bool checkAgainstTrue = false; // Is it a comparison against 1?
33092 if ((C = dyn_cast<ConstantSDNode>(Op1)))
33093 SetCC = Op2;
33094 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
33095 SetCC = Op1;
33096 else // Quit if all operands are not constants.
33097 return SDValue();
33099 if (C->getZExtValue() == 1) {
33100 needOppositeCond = !needOppositeCond;
33101 checkAgainstTrue = true;
33102 } else if (C->getZExtValue() != 0)
33103 // Quit if the constant is neither 0 nor 1.
33104 return SDValue();
33106 bool truncatedToBoolWithAnd = false;
33107 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
33108 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
33109 SetCC.getOpcode() == ISD::TRUNCATE ||
33110 SetCC.getOpcode() == ISD::AND) {
33111 if (SetCC.getOpcode() == ISD::AND) {
33112 int OpIdx = -1;
33113 if (isOneConstant(SetCC.getOperand(0)))
33114 OpIdx = 1;
33115 if (isOneConstant(SetCC.getOperand(1)))
33116 OpIdx = 0;
33117 if (OpIdx < 0)
33118 break;
33119 SetCC = SetCC.getOperand(OpIdx);
33120 truncatedToBoolWithAnd = true;
33121 } else
33122 SetCC = SetCC.getOperand(0);
33123 }
33125 switch (SetCC.getOpcode()) {
33126 case X86ISD::SETCC_CARRY:
33127 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
33128 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
33129 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
33130 // truncated to i1 using 'and'.
33131 if (checkAgainstTrue && !truncatedToBoolWithAnd)
33132 return SDValue();
33133 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
33134 "Invalid use of SETCC_CARRY!");
33135 LLVM_FALLTHROUGH;
33136 case X86ISD::SETCC:
33137 // Set the condition code or opposite one if necessary.
33138 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
33139 if (needOppositeCond)
33140 CC = X86::GetOppositeBranchCondition(CC);
33141 return SetCC.getOperand(1);
33142 case X86ISD::CMOV: {
33143 // Check whether false/true value has canonical one, i.e. 0 or 1.
33144 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
33145 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
33146 // Quit if true value is not a constant.
33147 if (!TVal)
33148 return SDValue();
33149 // Quit if false value is not a constant.
33150 if (!FVal) {
33151 SDValue Op = SetCC.getOperand(0);
33152 // Skip 'zext' or 'trunc' node.
33153 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
33154 Op.getOpcode() == ISD::TRUNCATE)
33155 Op = Op.getOperand(0);
33156 // A special case for rdrand/rdseed, where 0 is set if false cond is
33158 if ((Op.getOpcode() != X86ISD::RDRAND &&
33159 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
33160 return SDValue();
33161 }
33162 // Quit if false value is not the constant 0 or 1.
33163 bool FValIsFalse = true;
33164 if (FVal && FVal->getZExtValue() != 0) {
33165 if (FVal->getZExtValue() != 1)
33166 return SDValue();
33167 // If FVal is 1, opposite cond is needed.
33168 needOppositeCond = !needOppositeCond;
33169 FValIsFalse = false;
33170 }
33171 // Quit if TVal is not the constant opposite of FVal.
33172 if (FValIsFalse && TVal->getZExtValue() != 1)
33173 return SDValue();
33174 if (!FValIsFalse && TVal->getZExtValue() != 0)
33175 return SDValue();
33176 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
33177 if (needOppositeCond)
33178 CC = X86::GetOppositeBranchCondition(CC);
33179 return SetCC.getOperand(3);
33180 }
33181 }
33183 return SDValue();
33184 }
33186 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
33187 /// Match:
33188 /// (X86or (X86setcc) (X86setcc))
33189 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
33190 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
33191 X86::CondCode &CC1, SDValue &Flags,
33192 bool &isAnd) {
33193 if (Cond->getOpcode() == X86ISD::CMP) {
33194 if (!isNullConstant(Cond->getOperand(1)))
33195 return false;
33197 Cond = Cond->getOperand(0);
33198 }
33200 isAnd = false;
33202 SDValue SetCC0, SetCC1;
33203 switch (Cond->getOpcode()) {
33204 default: return false;
33205 case ISD::AND:
33206 case X86ISD::AND:
33207 isAnd = true;
33208 LLVM_FALLTHROUGH;
33209 case ISD::OR:
33210 case X86ISD::OR:
33211 SetCC0 = Cond->getOperand(0);
33212 SetCC1 = Cond->getOperand(1);
33213 break;
33214 }
33216 // Make sure we have SETCC nodes, using the same flags value.
33217 if (SetCC0.getOpcode() != X86ISD::SETCC ||
33218 SetCC1.getOpcode() != X86ISD::SETCC ||
33219 SetCC0->getOperand(1) != SetCC1->getOperand(1))
33220 return false;
33222 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
33223 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
33224 Flags = SetCC0->getOperand(1);
33225 return true;
33226 }
33228 // When legalizing carry, we create carries via add X, -1
33229 // If that comes from an actual carry, via setcc, we use the
33230 // carry directly.
33231 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
33232 if (EFLAGS.getOpcode() == X86ISD::ADD) {
33233 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
33234 SDValue Carry = EFLAGS.getOperand(0);
33235 while (Carry.getOpcode() == ISD::TRUNCATE ||
33236 Carry.getOpcode() == ISD::ZERO_EXTEND ||
33237 Carry.getOpcode() == ISD::SIGN_EXTEND ||
33238 Carry.getOpcode() == ISD::ANY_EXTEND ||
33239 (Carry.getOpcode() == ISD::AND &&
33240 isOneConstant(Carry.getOperand(1))))
33241 Carry = Carry.getOperand(0);
33242 if (Carry.getOpcode() == X86ISD::SETCC ||
33243 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
33244 if (Carry.getConstantOperandVal(0) == X86::COND_B)
33245 return Carry.getOperand(1);
33246 }
33247 }
33248 }
33250 return SDValue();
33251 }
33253 /// Optimize an EFLAGS definition used according to the condition code \p CC
33254 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
33255 /// uses of chain values.
33256 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
33257 SelectionDAG &DAG,
33258 const X86Subtarget &Subtarget) {
33259 if (CC == X86::COND_B)
33260 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
33261 return Flags;
33263 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
33264 return R;
33265 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
33266 }
33268 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
33269 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
33270 TargetLowering::DAGCombinerInfo &DCI,
33271 const X86Subtarget &Subtarget) {
33272 SDLoc DL(N);
33274 SDValue FalseOp = N->getOperand(0);
33275 SDValue TrueOp = N->getOperand(1);
33276 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
33277 SDValue Cond = N->getOperand(3);
33279 // Try to simplify the EFLAGS and condition code operands.
33280 // We can't always do this as FCMOV only supports a subset of X86 cond.
33281 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
33282 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
33283 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
33284 Flags};
33285 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33286 }
33287 }
33289 // If this is a select between two integer constants, try to do some
33290 // optimizations. Note that the operands are ordered the opposite of SELECT
33291 // operands.
33292 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
33293 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
33294 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
33295 // larger than FalseC (the false value).
33296 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
33297 CC = X86::GetOppositeBranchCondition(CC);
33298 std::swap(TrueC, FalseC);
33299 std::swap(TrueOp, FalseOp);
33300 }
33302 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
33303 // This is efficient for any integer data type (including i8/i16) and
33304 // shift amount.
33305 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
33306 Cond = getSETCC(CC, Cond, DL, DAG);
33308 // Zero extend the condition if needed.
33309 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
33311 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
33312 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
33313 DAG.getConstant(ShAmt, DL, MVT::i8));
33314 return Cond;
33315 }
33317 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
33318 // for any integer data type, including i8/i16.
33319 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
33320 Cond = getSETCC(CC, Cond, DL, DAG);
33322 // Zero extend the condition if needed.
33323 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
33324 FalseC->getValueType(0), Cond);
33325 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33326 SDValue(FalseC, 0));
33327 return Cond;
33328 }
33330 // Optimize cases that will turn into an LEA instruction. This requires
33331 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
33332 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
33333 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
33334 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
33336 bool isFastMultiplier = false;
33337 if (Diff < 10) {
33338 switch ((unsigned char)Diff) {
33339 default: break;
33340 case 1: // result = add base, cond
33341 case 2: // result = lea base( , cond*2)
33342 case 3: // result = lea base(cond, cond*2)
33343 case 4: // result = lea base( , cond*4)
33344 case 5: // result = lea base(cond, cond*4)
33345 case 8: // result = lea base( , cond*8)
33346 case 9: // result = lea base(cond, cond*8)
33347 isFastMultiplier = true;
33348 break;
33349 }
33350 }
33352 if (isFastMultiplier) {
33353 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
33354 Cond = getSETCC(CC, Cond, DL ,DAG);
33355 // Zero extend the condition if needed.
33356 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
33357 Cond);
33358 // Scale the condition by the difference.
33359 if (Diff != 1)
33360 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
33361 DAG.getConstant(Diff, DL, Cond.getValueType()));
33363 // Add the base if non-zero.
33364 if (FalseC->getAPIntValue() != 0)
33365 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33366 SDValue(FalseC, 0));
33367 return Cond;
33368 }
33369 }
33370 }
33371 }
33373 // Handle these cases:
33374 // (select (x != c), e, c) -> select (x != c), e, x),
33375 // (select (x == c), c, e) -> select (x == c), x, e)
33376 // where the c is an integer constant, and the "select" is the combination
33377 // of CMOV and CMP.
33379 // The rationale for this change is that the conditional-move from a constant
33380 // needs two instructions, however, conditional-move from a register needs
33381 // only one instruction.
33383 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
33384 // some instruction-combining opportunities. This opt needs to be
33385 // postponed as late as possible.
33387 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
33388 // the DCI.xxxx conditions are provided to postpone the optimization as
33389 // late as possible.
33391 ConstantSDNode *CmpAgainst = nullptr;
33392 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
33393 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
33394 !isa<ConstantSDNode>(Cond.getOperand(0))) {
33396 if (CC == X86::COND_NE &&
33397 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
33398 CC = X86::GetOppositeBranchCondition(CC);
33399 std::swap(TrueOp, FalseOp);
33400 }
33402 if (CC == X86::COND_E &&
33403 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
33404 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
33405 DAG.getConstant(CC, DL, MVT::i8), Cond };
33406 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33407 }
33408 }
33411 // Fold and/or of setcc's to double CMOV:
33412 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
33413 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
33415 // This combine lets us generate:
33416 // cmovcc1 (jcc1 if we don't have CMOV)
33417 // cmovcc2 (same)
33418 // instead of:
33419 // setcc1
33420 // setcc2
33421 // and/or
33422 // cmovne (jne if we don't have CMOV)
33423 // When we can't use the CMOV instruction, it might increase branch
33424 // tracking.
33425 // When we can use CMOV, or when there is no mispredict, this improves
33426 // throughput and reduces register pressure.
33428 if (CC == X86::COND_NE) {
33429 SDValue Flags;
33430 X86::CondCode CC0, CC1;
33431 bool isAndSetCC;
33432 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
33433 if (isAndSetCC) {
33434 std::swap(FalseOp, TrueOp);
33435 CC0 = X86::GetOppositeBranchCondition(CC0);
33436 CC1 = X86::GetOppositeBranchCondition(CC1);
33437 }
33439 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
33441 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
33440 Flags};
33442 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
33443 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33444 return CMOV;
33445 }
33446 }
33448 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
33449 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
33450 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
33451 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
33452 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
33453 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
33454 SDValue Add = TrueOp;
33455 SDValue Const = FalseOp;
33456 // Canonicalize the condition code for easier matching and output.
33457 if (CC == X86::COND_E) {
33458 std::swap(Add, Const);
33459 CC = X86::COND_NE;
33460 }
33462 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
33463 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
33464 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
33465 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
33466 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
33467 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
33468 EVT VT = N->getValueType(0);
33469 // This should constant fold.
33470 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
33471 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
33472 DAG.getConstant(CC, DL, MVT::i8), Cond);
33473 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
33474 }
33475 }
33477 return SDValue();
33478 }
33480 /// Different mul shrinking modes.
33481 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
33483 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
33484 EVT VT = N->getOperand(0).getValueType();
33485 if (VT.getScalarSizeInBits() != 32)
33486 return false;
33488 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
33489 unsigned SignBits[2] = {1, 1};
33490 bool IsPositive[2] = {false, false};
33491 for (unsigned i = 0; i < 2; i++) {
33492 SDValue Opd = N->getOperand(i);
33494 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
33495 // compute signbits for it separately.
33496 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
33497 // For anyextend, it is safe to assume an appropriate number of leading
33498 // sign/zero bits.
33499 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
33500 SignBits[i] = 25;
33501 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
33502 MVT::i16)
33503 SignBits[i] = 17;
33504 else
33505 return false;
33506 IsPositive[i] = true;
33507 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
33508 // All the operands of BUILD_VECTOR need to be int constant.
33509 // Find the smallest value range which all the operands belong to.
33510 SignBits[i] = 32;
33511 IsPositive[i] = true;
33512 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
33513 if (SubOp.isUndef())
33514 continue;
33515 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
33516 if (!CN)
33517 return false;
33518 APInt IntVal = CN->getAPIntValue();
33519 if (IntVal.isNegative())
33520 IsPositive[i] = false;
33521 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
33522 }
33523 } else {
33524 SignBits[i] = DAG.ComputeNumSignBits(Opd);
33525 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
33526 IsPositive[i] = true;
33527 }
33528 }
33530 bool AllPositive = IsPositive[0] && IsPositive[1];
33531 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
33532 // When ranges are from -128 ~ 127, use MULS8 mode.
33533 if (MinSignBits >= 25)
33534 Mode = MULS8;
33535 // When ranges are from 0 ~ 255, use MULU8 mode.
33536 else if (AllPositive && MinSignBits >= 24)
33537 Mode = MULU8;
33538 // When ranges are from -32768 ~ 32767, use MULS16 mode.
33539 else if (MinSignBits >= 17)
33540 Mode = MULS16;
33541 // When ranges are from 0 ~ 65535, use MULU16 mode.
33542 else if (AllPositive && MinSignBits >= 16)
33543 Mode = MULU16;
33544 else
33545 return false;
33546 return true;
33547 }
33549 /// When the operands of vector mul are extended from smaller size values,
33550 /// like i8 and i16, the type of mul may be shrunk to generate more
33551 /// efficient code. Two typical patterns are handled:
33553 /// %2 = sext/zext <N x i8> %1 to <N x i32>
33554 /// %4 = sext/zext <N x i8> %3 to <N x i32>
33555 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33556 /// %5 = mul <N x i32> %2, %4
33559 /// %2 = zext/sext <N x i16> %1 to <N x i32>
33560 /// %4 = zext/sext <N x i16> %3 to <N x i32>
33561 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33562 /// %5 = mul <N x i32> %2, %4
33564 /// There are four mul shrinking modes:
33565 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
33566 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
33567 /// generate pmullw+sext32 for it (MULS8 mode).
33568 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
33569 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
33570 /// generate pmullw+zext32 for it (MULU8 mode).
33571 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
33572 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
33573 /// generate pmullw+pmulhw for it (MULS16 mode).
33574 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
33575 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
33576 /// generate pmullw+pmulhuw for it (MULU16 mode).
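/// An illustrative sketch of MULU16 (operands already truncated to i16):
///   %lo = pmullw  %a, %b    ; low 16 bits of each product
///   %hi = pmulhuw %a, %b    ; high 16 bits of each product
/// with punpcklwd/punpckhwd interleaving %lo/%hi back into i32 lanes.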
33577 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
33578 const X86Subtarget &Subtarget) {
33579 // Check for legality
33580 // pmullw/pmulhw are not supported by SSE.
33581 if (!Subtarget.hasSSE2())
33582 return SDValue();
33584 // Check for profitability
33585 // pmulld is supported since SSE41. It is better to use pmulld
33586 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
33587 // pmullw+pmulhw.
33588 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
33589 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
33590 return SDValue();
33592 ShrinkMode Mode;
33593 if (!canReduceVMulWidth(N, DAG, Mode))
33594 return SDValue();
33596 SDLoc DL(N);
33597 SDValue N0 = N->getOperand(0);
33598 SDValue N1 = N->getOperand(1);
33599 EVT VT = N->getOperand(0).getValueType();
33600 unsigned NumElts = VT.getVectorNumElements();
33601 if ((NumElts % 2) != 0)
33602 return SDValue();
33604 unsigned RegSize = 128;
33605 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
33606 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
33608 // Shrink the operands of mul.
33609 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
33610 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
33612 if (NumElts >= OpsVT.getVectorNumElements()) {
33613 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
33614 // lower part is needed.
33615 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
33616 if (Mode == MULU8 || Mode == MULS8) {
33617 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
33618 DL, VT, MulLo);
33619 }
33620 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
33621 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
33622 // the higher part is also needed.
33623 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33624 ReducedVT, NewN0, NewN1);
33626 // Repack the lower part and higher part result of mul into a wider
33627 // result.
33628 // Generate shuffle functioning as punpcklwd.
33629 SmallVector<int, 16> ShuffleMask(NumElts);
33630 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33631 ShuffleMask[2 * i] = i;
33632 ShuffleMask[2 * i + 1] = i + NumElts;
33633 }
33634 SDValue ResLo =
33635 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33636 ResLo = DAG.getBitcast(ResVT, ResLo);
33637 // Generate shuffle functioning as punpckhwd.
33638 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33639 ShuffleMask[2 * i] = i + NumElts / 2;
33640 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33641 }
33642 SDValue ResHi =
33643 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33644 ResHi = DAG.getBitcast(ResVT, ResHi);
33645 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
  // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
  // to legalize the mul explicitly because implicit legalization for type
  // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
  // instructions which will not exist when we explicitly legalize it by
  // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
  // <4 x i16> undef).
  //
  // Legalize the operands of mul.
  // FIXME: We may be able to handle non-concatenated vectors by insertion.
  unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
  if ((RegSize % ReducedSizeInBits) != 0)
    return SDValue();

  SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
                               DAG.getUNDEF(ReducedVT));
  Ops[0] = NewN0;
  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
  Ops[0] = NewN1;
  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

  if (Mode == MULU8 || Mode == MULS8) {
    // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
    // part is needed.
    SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

    // Convert the type of mul result to VT.
    MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
    SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                            : ISD::SIGN_EXTEND_VECTOR_INREG,
                              DL, ResVT, Mul);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
  // MULU16/MULS16, both parts are needed.
  SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
  SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
                              OpsVT, NewN0, NewN1);

  // Repack the lower part and higher part result of mul into a wider
  // result. Make sure the type of mul result is VT.
  MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
  SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
  Res = DAG.getBitcast(ResVT, Res);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                     DAG.getIntPtrConstant(0, DL));
}
33698 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
33699 EVT VT, const SDLoc &DL) {
  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mult, DL, VT));
    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
                         DAG.getConstant(Shift, DL, MVT::i8));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(Mul1, DL, VT));
    Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
                         DAG.getConstant(Mul2, DL, VT));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  switch (MulAmt) {
  default:
    break;
  case 11:
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
  case 21:
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
  case 41:
    // mul x, 41 => add ((shl (mul x, 5), 3), x)
    return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
  case 22:
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
  case 19:
    // mul x, 19 => add ((shl (mul x, 9), 1), x)
    return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
  case 37:
    // mul x, 37 => add ((shl (mul x, 9), 2), x)
    return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
  case 73:
    // mul x, 73 => add ((shl (mul x, 9), 3), x)
    return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
  case 13:
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
  case 23:
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
  case 26:
    // mul x, 26 => add ((mul (mul x, 5), 5), x)
    return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
  case 28:
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
  case 29:
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
  }
  // Another trick. If this is a power of 2 + 2/4/8, we can use a shift
  // followed by a single LEA.
  // First check if this is a sum of two powers of 2 because that's easy.
  // Then count how many zeros are up to the first bit.
  // TODO: We can do this even without LEA at a cost of two shifts and an add.
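  //
  // Illustrative example (not from the original comment): MulAmt == 36 ==
  // 32 + 4 gives ShiftAmt == 5 and ScaleShift == 2, so the result is
  // (x << 5) + (x << 2), and the final scaled add can be a single LEA.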
  if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
    unsigned ScaleShift = countTrailingZeros(MulAmt);
    if (ScaleShift >= 1 && ScaleShift < 4) {
      unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
      SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
      SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                   DAG.getConstant(ScaleShift, DL, MVT::i8));
      return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
    }
  }

  return SDValue();
}
// If the upper 17 bits of each element are zero then we can use PMADDWD,
// which is always at least as quick as PMULLD, except on KNL.
33786 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
33787 const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
    return SDValue();

  EVT VT = N->getValueType(0);
  // Only support vXi32 vectors.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Make sure the vXi16 type is legal. This covers the AVX512 without BWI
  // case.
  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
    return SDValue();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  APInt Mask17 = APInt::getHighBitsSet(32, 17);
  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
      !DAG.MaskedValueIsZero(N0, Mask17))
    return SDValue();
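
  // Why 17 zero bits suffice (illustrative): each i32 element is then a
  // non-negative 15-bit value, so viewed as i16 pairs the high half of each
  // pair is zero and PMADDWD's signed 16x16->32 multiply-accumulate computes
  // exactly the i32 product, with no possibility of overflow.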
  // Use SplitOpsAndApply to handle AVX splitting.
  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
    MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
  };
  return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                          { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
                          PMADDWDBuilder, /*CheckBWI*/false);
}
33823 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
33824 const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  EVT VT = N->getValueType(0);

  // Only support vXi64 vectors.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  // PMULDQ returns the 64-bit result of the signed multiplication of the
  // lower 32-bits. We can lower with this if the sign bits stretch that far.
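  // (Illustrative: if both operands were sign-extended from i32, at least 33
  // sign bits are reported, bits 32..63 of each element merely repeat bit 31,
  // and the 32x32->64 PMULDQ result equals the full 64-bit product.)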
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
      DAG.ComputeNumSignBits(N1) > 32) {
    auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                            ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
                            PMULDQBuilder, /*CheckBWI*/false);
  }
  // If the upper bits are zero we can use a single pmuludq.
  APInt Mask = APInt::getHighBitsSet(64, 32);
  if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
    auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                             ArrayRef<SDValue> Ops) {
      return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
    };
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
                            PMULUDQBuilder, /*CheckBWI*/false);
  }

  return SDValue();
}
33864 /// Optimize a single multiply with constant into two operations in order to
33865 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
33866 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
33867 TargetLowering::DAGCombinerInfo &DCI,
33868 const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
    return V;
  if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
    return V;

  if (DCI.isBeforeLegalize() && VT.isVector())
    return reduceVMULWidth(N, DAG, Subtarget);

  if (!MulConstantOptimization)
    return SDValue();
  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();
  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  if (isPowerOf2_64(C->getZExtValue()))
    return SDValue();

  int64_t SignMulAmt = C->getSExtValue();
  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
  uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
  SDLoc DL(N);
  if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
    SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                                 DAG.getConstant(AbsMulAmt, DL, VT));
    if (SignMulAmt < 0)
      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                           NewMul);
    return NewMul;
  }

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((AbsMulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = AbsMulAmt / 9;
  } else if ((AbsMulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = AbsMulAmt / 5;
  } else if ((AbsMulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = AbsMulAmt / 3;
  }

  SDValue NewMul;
  // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) ||
       (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
      std::swap(MulAmt1, MulAmt2);
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, DL, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, DL, VT));
    // Negate the result.
    if (SignMulAmt < 0)
      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                           NewMul);
  } else if (!Subtarget.slowLEA())
    NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);

  if (!NewMul) {
    assert(C->getZExtValue() != 0 &&
           C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
           "Both cases that could cause potential overflows should have "
           "already been handled.");
    if (isPowerOf2_64(AbsMulAmt - 1)) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::ADD, DL, VT, N->getOperand(0),
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
                                      MVT::i8)));
      // To negate, subtract the number from zero.
      if (SignMulAmt < 0)
        NewMul = DAG.getNode(ISD::SUB, DL, VT,
                             DAG.getConstant(0, DL, VT), NewMul);
    } else if (isPowerOf2_64(AbsMulAmt + 1)) {
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt + 1),
                                           DL, MVT::i8));
      // To negate, reverse the operands of the subtract.
      if (SignMulAmt < 0)
        NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
      else
        NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
      // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt - 2),
                                           DL, MVT::i8));
      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
      NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
    } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
      // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(AbsMulAmt + 2),
                                           DL, MVT::i8));
      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
      NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
    }
  }

  return NewMul;
}
34006 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
34007 SDValue N0 = N->getOperand(0);
34008 SDValue N1 = N->getOperand(1);
34009 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
34010 EVT VT = N0.getValueType();
34012 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
34013 // since the result of setcc_c is all zero's or all ones.
34014 if (VT.isInteger() && !VT.isVector() &&
34015 N1C && N0.getOpcode() == ISD::AND &&
34016 N0.getOperand(1).getOpcode() == ISD::Constant) {
34017 SDValue N00 = N0.getOperand(0);
34018 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
34019 Mask <<= N1C->getAPIntValue();
34020 bool MaskOK = false;
34021 // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
34024 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
34025 // of the underlying setcc_c operation if the setcc_c was zero extended.
34026 // Consider the following example:
34027 // zext(setcc_c) -> i32 0x0000FFFF
34028 // c1 -> i32 0x0000FFFF
34029 // c2 -> i32 0x00000001
34030 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
34031 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }
  // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // SHL.
  // (shl V, 1) -> add V,V
34052 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
34053 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
34054 assert(N0.getValueType().isVector() && "Invalid vector shift type");
34055 // We shift all of the values by one. In many cases we do not have
34056 // hardware support for this operation. This is better expressed as an ADD
      if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}
34065 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
34066 SDValue N0 = N->getOperand(0);
34067 SDValue N1 = N->getOperand(1);
34068 EVT VT = N0.getValueType();
34069 unsigned Size = VT.getSizeInBits();
34071 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
34072 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
34073 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
34074 // depending on sign of (SarConst - [56,48,32,24,16])
  // sexts in X86 are MOVs. The MOVs have the same code size
  // as the above SHIFTs (only a SHIFT by 1 has lower code size).
  // However the MOVs have 2 advantages over a SHIFT:
  // 1. MOVs can write to a register that differs from the source.
  // 2. MOVs accept memory operands.
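  //
  // Illustrative example (not from the original comment): on i64,
  // (sra (shl X, 56), 61) becomes (sra (sext_inreg X, i8), 5), i.e.
  // sign-extend the low byte, then shift right by 61 - 56 = 5.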
  if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();
34087 SDValue N00 = N0.getOperand(0);
34088 SDValue N01 = N0.getOperand(1);
34089 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
34090 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
34091 EVT CVT = N1.getValueType();
  if (SarConst.isNegative())
    return SDValue();
34096 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
34097 unsigned ShiftSize = SVT.getSizeInBits();
34098 // skipping types without corresponding sext/zext and
34099 // ShlConst that is not one of [56,48,32,24,16]
    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    return DAG.getNode(ISD::SRA, DL, VT, NN,
                       DAG.getConstant(SarConst, DL, CVT));
  }
  return SDValue();
}
34118 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
34119 TargetLowering::DAGCombinerInfo &DCI) {
34120 SDValue N0 = N->getOperand(0);
34121 SDValue N1 = N->getOperand(1);
34122 EVT VT = N0.getValueType();
  // Only do this on the last DAG combine as it can interfere with other
  // combines.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();
34129 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
34130 // TODO: This is a generic DAG combine that became an x86-only combine to
34131 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
34132 // and-not ('andn').
  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
    return SDValue();

  auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
  auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
  if (!ShiftC || !AndC)
    return SDValue();
34141 // If we can shrink the constant mask below 8-bits or 32-bits, then this
34142 // transform should reduce code size. It may also enable secondary transforms
34143 // from improved known-bits analysis or instruction selection.
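  //
  // Illustrative example: 'srl (and X, 0x7F0), 4' becomes
  // 'and (srl X, 4), 0x7F', shrinking the mask from 12 to 8 signed bits.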
34144 APInt MaskVal = AndC->getAPIntValue();
34146 // If this can be matched by a zero extend, don't optimize.
  if (MaskVal.isMask()) {
    unsigned TO = MaskVal.countTrailingOnes();
    if (TO >= 8 && isPowerOf2_32(TO))
      return SDValue();
  }
34153 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
34154 unsigned OldMaskSize = MaskVal.getMinSignedBits();
34155 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
34156 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
34157 (OldMaskSize > 32 && NewMaskSize <= 32)) {
    // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
    SDLoc DL(N);
    SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
    SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
  }

  return SDValue();
}
34167 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
34168 TargetLowering::DAGCombinerInfo &DCI,
34169 const X86Subtarget &Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightArithmetic(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRL)
    if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
      return V;

  return SDValue();
}
34185 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
34186 TargetLowering::DAGCombinerInfo &DCI,
34187 const X86Subtarget &Subtarget) {
34188 unsigned Opcode = N->getOpcode();
  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");
34192 EVT VT = N->getValueType(0);
34193 SDValue N0 = N->getOperand(0);
34194 SDValue N1 = N->getOperand(1);
34195 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
34196 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
34197 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
34198 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
34199 "Unexpected PACKSS/PACKUS input type");
34201 // Constant Folding.
34202 APInt UndefElts0, UndefElts1;
34203 SmallVector<APInt, 32> EltBits0, EltBits1;
34204 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
34205 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
34206 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
34207 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
34208 unsigned NumLanes = VT.getSizeInBits() / 128;
34209 unsigned NumDstElts = VT.getVectorNumElements();
34210 unsigned NumSrcElts = NumDstElts / 2;
34211 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
34212 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
34213 bool IsSigned = (X86ISD::PACKSS == Opcode);
34215 APInt Undefs(NumDstElts, 0);
34216 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
34217 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
34218 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
34219 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
34220 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
34221 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
        if (UndefElts[SrcIdx]) {
          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
          continue;
        }

        APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
          // Source values less than dst minint are saturated to minint.
          // Source values greater than dst maxint are saturated to maxint.
          if (Val.isSignedIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
          // Source values less than zero are saturated to zero.
          // Source values greater than dst maxuint are saturated to maxuint.
          if (Val.isIntN(DstBitsPerElt))
            Val = Val.trunc(DstBitsPerElt);
          else if (Val.isNegative())
            Val = APInt::getNullValue(DstBitsPerElt);
          else
            Val = APInt::getAllOnesValue(DstBitsPerElt);
        }
        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
      }
    }
    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  // Attempt to combine as shuffle.
  SDValue Op(N, 0);
  if (SDValue Res =
          combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
                                        /*HasVarMask*/ false, DAG, Subtarget))
    return Res;

  return SDValue();
}
34267 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
34268 TargetLowering::DAGCombinerInfo &DCI,
34269 const X86Subtarget &Subtarget) {
34270 unsigned Opcode = N->getOpcode();
34271 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
34272 X86ISD::VSRLI == Opcode) &&
34273 "Unexpected shift opcode");
34274 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
34275 EVT VT = N->getValueType(0);
34276 SDValue N0 = N->getOperand(0);
34277 SDValue N1 = N->getOperand(1);
34278 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
34279 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
34280 "Unexpected value type");
34282 // Out of range logical bit shifts are guaranteed to be zero.
34283 // Out of range arithmetic bit shifts splat the sign bit.
34284 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    ShiftVal = NumBitsPerElt - 1;
  }

  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N0;
34296 // Shift zero -> zero.
34297 if (ISD::isBuildVectorAllZeros(N0.getNode()))
34298 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34300 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
34301 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
34302 // TODO - support other sra opcodes as needed.
34303 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
34304 N0.getOpcode() == X86ISD::VSRAI)
34305 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
34307 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
34308 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
34309 N1 == N0.getOperand(1)) {
34310 SDValue N00 = N0.getOperand(0);
34311 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
    if (ShiftVal.ult(NumSignBits))
      return N00;
  }
  // We can decode 'whole byte' logical bit shifts as shuffles.
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, Subtarget))
      return Res;
  }
  // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
34328 if (N->isOnlyUserOf(N0.getNode()) &&
34329 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
34330 assert(EltBits.size() == VT.getVectorNumElements() &&
34331 "Unexpected shift value type");
    unsigned ShiftImm = ShiftVal.getZExtValue();
    for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  return SDValue();
}
34347 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
34348 TargetLowering::DAGCombinerInfo &DCI,
34349 const X86Subtarget &Subtarget) {
  assert(((N->getOpcode() == X86ISD::PINSRB &&
           N->getValueType(0) == MVT::v16i8) ||
          (N->getOpcode() == X86ISD::PINSRW &&
           N->getValueType(0) == MVT::v8i16)) &&
         "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  if (SDValue Res =
          combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
                                        /*HasVarMask*/ false, DAG, Subtarget))
    return Res;

  return SDValue();
}
34366 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
34367 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
34368 /// OR -> CMPNEQSS.
34369 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
34370 TargetLowering::DAGCombinerInfo &DCI,
34371 const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
34376 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
34377 SDValue N0 = N->getOperand(0);
34378 SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);
34383 // The SETCCs should both refer to the same CMP.
34384 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
34387 SDValue CMP00 = CMP0->getOperand(0);
34388 SDValue CMP01 = CMP0->getOperand(1);
34389 EVT VT = CMP00.getValueType();
34391 if (VT == MVT::f32 || VT == MVT::f64) {
34392 bool ExpectingFlags = false;
34393 // Check for any users that want flags:
34394 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
34395 !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }
34410 if (!ExpectingFlags) {
34411 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
34412 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }
34420 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
34421 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
34422 // FIXME: need symbolic constants for these magic numbers.
34423 // See X86ATTInstPrinter.cpp:printSSECC().
34424 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
34425 if (Subtarget.hasAVX512()) {
34427 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
34428 DAG.getConstant(x86cc, DL, MVT::i8));
34429 // Need to fill with zeros to ensure the bitcast will produce zeroes
34430 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
34431 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
34432 DAG.getConstant(0, DL, MVT::v16i1),
34433 FSetCC, DAG.getIntPtrConstant(0, DL));
34434 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
34435 N->getSimpleValueType(0));
34437 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
34438 CMP00.getValueType(), CMP00, CMP01,
34439 DAG.getConstant(x86cc, DL,
34442 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
34443 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
34445 if (is64BitFP && !Subtarget.is64Bit()) {
34446 // On a 32-bit target, we cannot bitcast the 64-bit float to a
34447 // 64-bit integer, since that's not a legal type. Since
34448 // OnesOrZeroesF is all ones of all zeroes, we don't need all the
34449 // bits, but can do this little dance to extract the lowest 32 bits
34450 // and work with those going forward.
34451 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
34453 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
34454 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
34455 Vector32, DAG.getIntPtrConstant(0, DL));
34459 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
34460 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
34461 DAG.getConstant(1, DL, IntVT));
34462 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
34464 return OneBitOfTruth;
34472 // Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
  if (N->getOpcode() != ISD::AND)
    return false;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
    X = N0.getOperand(0);
    Y = N1;
    return true;
  }
  if (N1.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
    X = N1.getOperand(0);
    Y = N0;
    return true;
  }

  return false;
}
34495 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
34496 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
34497 assert(N->getOpcode() == ISD::AND);
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();

  SDValue X, Y;
  if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
    return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);

  return SDValue();
}
34510 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
34511 // register. In most cases we actually compare or select YMM-sized registers
34512 // and mixing the two types creates horrible code. This method optimizes
34513 // some of the transition sequences.
34514 // Even with AVX-512 this is still useful for removing casts around logical
34515 // operations on vXi1 mask types.
34516 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
34517 const X86Subtarget &Subtarget) {
34518 EVT VT = N->getValueType(0);
34519 assert(VT.isVector() && "Expected vector type");
34521 assert((N->getOpcode() == ISD::ANY_EXTEND ||
34522 N->getOpcode() == ISD::ZERO_EXTEND ||
34523 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
34525 SDValue Narrow = N->getOperand(0);
34526 EVT NarrowVT = Narrow.getValueType();
  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();
  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(N);
  // The left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();
  // The type of the truncated inputs.
  if (N0->getOperand(0).getValueType() != VT)
    return SDValue();
34545 // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                  N1.getOperand(0).getValueType() == VT;
  if (!RHSTrunc &&
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
    return SDValue();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
    return SDValue();
  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSTrunc)
    N1 = N1->getOperand(0);
  else
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
34564 // Generate the wide operation.
34565 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  }
}
34579 /// If both input operands of a logic op are being cast from floating point
34580 /// types, try to convert this into a floating point logic node to avoid
34581 /// unnecessary moves from SSE to integer registers.
34582 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
34583 const X86Subtarget &Subtarget) {
34584 unsigned FPOpcode = ISD::DELETED_NODE;
34585 if (N->getOpcode() == ISD::AND)
34586 FPOpcode = X86ISD::FAND;
34587 else if (N->getOpcode() == ISD::OR)
34588 FPOpcode = X86ISD::FOR;
34589 else if (N->getOpcode() == ISD::XOR)
34590 FPOpcode = X86ISD::FXOR;
34592 assert(FPOpcode != ISD::DELETED_NODE &&
34593 "Unexpected input node for FP logic conversion");
34595 EVT VT = N->getValueType(0);
34596 SDValue N0 = N->getOperand(0);
34597 SDValue N1 = N->getOperand(1);
34599 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
34600 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
34601 (Subtarget.hasSSE2() && VT == MVT::i64))) {
34602 SDValue N00 = N0.getOperand(0);
34603 SDValue N10 = N1.getOperand(0);
34604 EVT N00Type = N00.getValueType();
34605 EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDLoc DL(N);
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }

  return SDValue();
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
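///
/// (Illustrative: 'and (pcmpgt X, Y), (splat 1)' on v4i32 can instead be
/// 'psrld (pcmpgt X, Y), 31', because every lane is all-ones or all-zeros.)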
34617 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
34618 const X86Subtarget &Subtarget) {
34619 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
34620 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
34621 EVT VT0 = Op0.getValueType();
34622 EVT VT1 = Op1.getValueType();
34624 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
34628 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
34629 !SplatVal.isMask())
34632 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
34635 unsigned EltBitWidth = VT0.getScalarSizeInBits();
34636 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
34640 unsigned ShiftVal = SplatVal.countTrailingOnes();
34641 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
34642 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
34643 return DAG.getBitcast(N->getValueType(0), Shift);
34646 // Get the index node from the lowered DAG of a GEP IR instruction with one
34647 // indexing dimension.
34648 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
34649 if (Ld->isIndexed())
34652 SDValue Base = Ld->getBasePtr();
34654 if (Base.getOpcode() != ISD::ADD)
34657 SDValue ShiftedIndex = Base.getOperand(0);
34659 if (ShiftedIndex.getOpcode() != ISD::SHL)
34662 return ShiftedIndex.getOperand(0);
34666 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
34667 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
34668 switch (VT.getSizeInBits()) {
34669 default: return false;
    case 64: return Subtarget.is64Bit();
34671 case 32: return true;
// This function recognizes cases where the X86 bzhi instruction can replace
// an 'and-load' sequence: loading an integer value from an array of constants
// defined as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// and then applying a bitwise 'and' on the result with another input.
// This is equivalent to performing bzhi (zero high bits) on the input, with
// the same index as the load.
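//
// Illustrative example (assumed, 32-bit elements):
//   int f(int x, unsigned idx) { return x & array[idx]; }
// can then be selected to a single 'bzhi' instead of a load plus 'and'.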
34687 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
34688 const X86Subtarget &Subtarget) {
34689 MVT VT = Node->getSimpleValueType(0);
34692 // Check if subtarget has BZHI instruction for the node's type
34693 if (!hasBZHI(Subtarget, VT))
34696 // Try matching the pattern for both operands.
34697 for (unsigned i = 0; i < 2; i++) {
34698 SDValue N = Node->getOperand(i);
34699 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
34701 // continue if the operand is not a load instruction
34705 const Value *MemOp = Ld->getMemOperand()->getValue();
34710 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
34711 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
34712 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
34714 Constant *Init = GV->getInitializer();
34715 Type *Ty = Init->getType();
34716 if (!isa<ConstantDataArray>(Init) ||
34717 !Ty->getArrayElementType()->isIntegerTy() ||
34718 Ty->getArrayElementType()->getScalarSizeInBits() !=
34719 VT.getSizeInBits() ||
34720 Ty->getArrayNumElements() >
34721 Ty->getArrayElementType()->getScalarSizeInBits())
34724 // Check if the array's constant elements are suitable to our case.
34725 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
34726 bool ConstantsMatch = true;
34727 for (uint64_t j = 0; j < ArrayElementCount; j++) {
34728 ConstantInt *Elem =
34729 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
34730 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
34731 ConstantsMatch = false;
34735 if (!ConstantsMatch)
34738 // Do the transformation (For 32-bit type):
34739 // -> (and (load arr[idx]), inp)
34740 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
34741 // that will be replaced with one bzhi instruction.
34742 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
34743 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
34745 // Get the Node which indexes into the array.
34746 SDValue Index = getIndexFromUnindexedLoad(Ld);
34749 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
34751 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
34752 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
34754 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
34755 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
34757 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
34765 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
34766 TargetLowering::DAGCombinerInfo &DCI,
34767 const X86Subtarget &Subtarget) {
34768 EVT VT = N->getValueType(0);
34770 // If this is SSE1 only convert to FAND to avoid scalarization.
34771 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34772 return DAG.getBitcast(
34773 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
34774 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34775 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34778 // Use a 32-bit and+zext if upper bits known zero.
34779 if (VT == MVT::i64 && Subtarget.is64Bit() &&
34780 !isa<ConstantSDNode>(N->getOperand(1))) {
34781 APInt HiMask = APInt::getHighBitsSet(64, 32);
34782 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
34783 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
34785 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
34786 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
34787 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
34788 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
34792 if (DCI.isBeforeLegalizeOps())
34795 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34798 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34801 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
34804 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
34807 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
34810 // Attempt to recursively combine a bitmask AND with shuffles.
34811 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34813 if (SDValue Res = combineX86ShufflesRecursively(
34814 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34815 /*HasVarMask*/ false, DAG, Subtarget))
34819 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
34820 if ((VT.getScalarSizeInBits() % 8) == 0 &&
34821 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34822 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
34823 SDValue BitMask = N->getOperand(1);
34824 SDValue SrcVec = N->getOperand(0).getOperand(0);
34825 EVT SrcVecVT = SrcVec.getValueType();
34827 // Check that the constant bitmask masks whole bytes.
34829 SmallVector<APInt, 64> EltBits;
34830 if (VT == SrcVecVT.getScalarType() &&
34831 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
34832 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
34833 llvm::all_of(EltBits, [](APInt M) {
34834 return M.isNullValue() || M.isAllOnesValue();
34836 unsigned NumElts = SrcVecVT.getVectorNumElements();
34837 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
34838 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
34840 // Create a root shuffle mask from the byte mask and the extracted index.
34841 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
34842 for (unsigned i = 0; i != Scale; ++i) {
34845 int VecIdx = Scale * Idx + i;
34846 ShuffleMask[VecIdx] =
34847 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
34850 if (SDValue Shuffle = combineX86ShufflesRecursively(
34851 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
34852 /*HasVarMask*/ false, DAG, Subtarget))
34853 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
34854 N->getOperand(0).getOperand(1));
34861 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
34862 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
34863 if (N->getOpcode() != ISD::OR)
34866 SDValue N0 = N->getOperand(0);
34867 SDValue N1 = N->getOperand(1);
34869 // Canonicalize AND to LHS.
34870 if (N1.getOpcode() == ISD::AND)
34873 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
34874 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
34877 Mask = N1.getOperand(0);
34878 X = N1.getOperand(1);
34880 // Check to see if the mask appeared in both the AND and ANDNP.
34881 if (N0.getOperand(0) == Mask)
34882 Y = N0.getOperand(1);
34883 else if (N0.getOperand(1) == Mask)
34884 Y = N0.getOperand(0);
34888 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
34889 // ANDNP combine allows other combines to happen that prevent matching.
34894 // (or (and (m, y), (pandn m, x)))
34896 // (vselect m, x, y)
34897 // As a special case, try to fold:
34898 // (or (and (m, (sub 0, x)), (pandn m, x)))
34900 // (sub (xor X, M), M)
34901 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
34902 const X86Subtarget &Subtarget) {
34903 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
34905 EVT VT = N->getValueType(0);
34906 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
34907 (VT.is256BitVector() && Subtarget.hasInt256())))
34910 SDValue X, Y, Mask;
34911 if (!matchLogicBlend(N, X, Y, Mask))
34914 // Validate that X, Y, and Mask are bitcasts, and see through them.
34915 Mask = peekThroughBitcasts(Mask);
34916 X = peekThroughBitcasts(X);
34917 Y = peekThroughBitcasts(Y);
34919 EVT MaskVT = Mask.getValueType();
34920 unsigned EltBits = MaskVT.getScalarSizeInBits();
34922 // TODO: Attempt to handle floating point cases as well?
34923 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
34929 // (or (and (M, (sub 0, X)), (pandn M, X)))
34930 // which is a special case of vselect:
34931 // (vselect M, (sub 0, X), X)
34933 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
34934 // We know that, if fNegate is 0 or 1:
34935 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
34937 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
34938 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
34939 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
34940 // This lets us transform our vselect to:
34941 // (add (xor X, M), (and M, 1))
34943 // (sub (xor X, M), M)
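  //
  // (Sanity check, illustrative: M == all-ones gives
  // (X ^ -1) - (-1) == ~X + 1 == -X, while M == 0 gives (X ^ 0) - 0 == X.)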
34944 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
34945 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
34946 auto IsNegV = [](SDNode *N, SDValue V) {
34947 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
34948 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
34951 if (IsNegV(Y.getNode(), X))
34953 else if (IsNegV(X.getNode(), Y))
34957 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
34958 SDValue SubOp2 = Mask;
34960 // If the negate was on the false side of the select, then
34961 // the operands of the SUB need to be swapped. PR 27251.
34962 // This is because the pattern being matched above is
34963 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
34964 // but if the pattern matched was
34965 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
34966 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
34967 // pattern also needs to be a negation of the replacement pattern above.
34968 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
34969 // sub accomplishes the negation of the replacement pattern.
34971 std::swap(SubOp1, SubOp2);
34973 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
34974 return DAG.getBitcast(VT, Res);
34978 // PBLENDVB is only available on SSE 4.1.
34979 if (!Subtarget.hasSSE41())
34982 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
34984 X = DAG.getBitcast(BlendVT, X);
34985 Y = DAG.getBitcast(BlendVT, Y);
34986 Mask = DAG.getBitcast(BlendVT, Mask);
34987 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
34988 return DAG.getBitcast(VT, Mask);
34991 // Helper function for combineOrCmpEqZeroToCtlzSrl
34995 // srl(ctlz x), log2(bitsize(x))
34996 // Input pattern is checked by caller.
34997 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
34998 SelectionDAG &DAG) {
34999 SDValue Cmp = Op.getOperand(1);
35000 EVT VT = Cmp.getOperand(0).getValueType();
35001 unsigned Log2b = Log2_32(VT.getSizeInBits());
35003 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
35004 // The result of the shift is true or false, and on X86, the 32-bit
35005 // encoding of shr and lzcnt is more desirable.
35006 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
35007 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
35008 DAG.getConstant(Log2b, dl, MVT::i8));
35009 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, e.g.:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
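//
// (Illustrative, i32: 'x == 0' is exactly 'ctlz(x) >> 5', since 32 is the
// only possible ctlz result with bit 5 set; or-ing several ctlz values
// before the shift therefore combines several such tests.)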
35019 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
35020 TargetLowering::DAGCombinerInfo &DCI,
35021 const X86Subtarget &Subtarget) {
35022 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
35025 auto isORCandidate = [](SDValue N) {
35026 return (N->getOpcode() == ISD::OR && N->hasOneUse());
35029 // Check the zero extend is extending to 32-bit or more. The code generated by
35030 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
35031 // instructions to clear the upper bits.
35032 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
35033 !isORCandidate(N->getOperand(0)))
35036 // Check the node matches: setcc(eq, cmp 0)
35037 auto isSetCCCandidate = [](SDValue N) {
35038 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
35039 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
35040 N->getOperand(1).getOpcode() == X86ISD::CMP &&
35041 isNullConstant(N->getOperand(1).getOperand(1)) &&
35042 N->getOperand(1).getValueType().bitsGE(MVT::i32);
35045 SDNode *OR = N->getOperand(0).getNode();
35046 SDValue LHS = OR->getOperand(0);
35047 SDValue RHS = OR->getOperand(1);
35049 // Save nodes matching or(or, setcc(eq, cmp 0)).
35050 SmallVector<SDNode *, 2> ORNodes;
35051 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
35052 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
35053 ORNodes.push_back(OR);
35054 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
35055 LHS = OR->getOperand(0);
35056 RHS = OR->getOperand(1);
35059 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
35060 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
35061 !isORCandidate(SDValue(OR, 0)))
35064 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
35066 // or(srl(ctlz),srl(ctlz)).
35067 // The dag combiner can then fold it into:
35068 // srl(or(ctlz, ctlz)).
35069 EVT VT = OR->getValueType(0);
35070 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
35071 SDValue Ret, NewRHS;
35072 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
35073 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
35078 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
35079 while (ORNodes.size() > 0) {
35080 OR = ORNodes.pop_back_val();
35081 LHS = OR->getOperand(0);
35082 RHS = OR->getOperand(1);
35083 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
35084 if (RHS->getOpcode() == ISD::OR)
35085 std::swap(LHS, RHS);
35086 EVT VT = OR->getValueType(0);
35087 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
35090 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
35094 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
35099 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
35100 TargetLowering::DAGCombinerInfo &DCI,
35101 const X86Subtarget &Subtarget) {
35102 SDValue N0 = N->getOperand(0);
35103 SDValue N1 = N->getOperand(1);
35104 EVT VT = N->getValueType(0);
35106 // If this is SSE1 only convert to FOR to avoid scalarization.
35107 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
35108 return DAG.getBitcast(MVT::v4i32,
35109 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
35110 DAG.getBitcast(MVT::v4f32, N0),
35111 DAG.getBitcast(MVT::v4f32, N1)));
35114 if (DCI.isBeforeLegalizeOps())
35117 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
35120 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35123 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();
  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
35153 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
35154 ShAmt0 = ShAmt0.getOperand(0);
35155 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
35156 ShAmt1 = ShAmt1.getOperand(0);
  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
35160 SDValue Op0 = N0.getOperand(0);
35161 SDValue Op1 = N1.getOperand(0);
35162 if (ShAmt0.getOpcode() == ISD::SUB ||
35163 ShAmt0.getOpcode() == ISD::XOR) {
35164 Opc = X86ISD::SHRD;
35165 std::swap(Op0, Op1);
35166 std::swap(ShAmt0, ShAmt1);
35169 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
35170 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
35171 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
35172 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
35173 unsigned Bits = VT.getSizeInBits();
35174 if (ShAmt1.getOpcode() == ISD::SUB) {
35175 SDValue Sum = ShAmt1.getOperand(0);
35176 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
35177 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
35178 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
35179 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           N0.getOperand(0), N1.getOperand(0),
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
35186 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
35187 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
35193 } else if (ShAmt1.getOpcode() == ISD::XOR) {
35194 SDValue Mask = ShAmt1.getOperand(1);
35195 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
35196 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
35197 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
35198 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
35199 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
35200 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
35201 if (Op1.getOpcode() == InnerShift &&
35202 isa<ConstantSDNode>(Op1.getOperand(1)) &&
35203 Op1.getConstantOperandVal(1) == 1) {
35204 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35205 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35207 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
35208 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
35209 Op1.getOperand(0) == Op1.getOperand(1)) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}
/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
35225 // This is only worth doing if the output type is i8 or i1.
35226 EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();
35230 SDValue N0 = N->getOperand(0);
35231 SDValue N1 = N->getOperand(1);
35233 // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();
35237 // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();
35241 // SetCC on x86 zero extends so only act on this if it's a logical shift.
35242 SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();
35246 // Make sure we are truncating from one of i16, i32 or i64.
35247 EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();
35251 // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();
  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison; using SETGT matches up with what TranslateX86CC produces.
  SDLoc DL(N);
35260 SDValue ShiftOp = Shift.getOperand(0);
35261 EVT ShiftOpTy = ShiftOp.getValueType();
35262 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35263 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
35264 *DAG.getContext(), ResultType);
35265 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
35266 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
35279 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
35280 const X86Subtarget &Subtarget) {
35281 EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();
  switch (VT.getSimpleVT().SimpleTy) {
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }
  // There must be an arithmetic shift right (sra) before the xor, and the xor
  // must be a 'not' operation.
35299 SDValue Shift = N->getOperand(0);
35300 SDValue Ones = N->getOperand(1);
35301 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
35302 !ISD::isBuildVectorAllOnes(Ones.getNode()))
35305 // The shift should be smearing the sign bit across each vector element.
35306 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
35310 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
35311 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
35312 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
35315 // Create a greater-than comparison against -1. We don't use the more obvious
35316 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
35317 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
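// For example (illustrative), for v4i32 %x:
//   xor (sra %x, <31,31,31,31>), <-1,-1,-1,-1>
// becomes
//   PCMPGT %x, <-1,-1,-1,-1>
// i.e. a single PCMPGTD instead of a shift followed by a NOT.
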
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
                                        const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return false;

  // FIXME: Scalar type may be supported if we move it to vector register.
  if (!SrcVT.isVector())
    return false;

  EVT SrcElVT = SrcVT.getScalarType();
  EVT DstElVT = DstVT.getScalarType();
  if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
    return false;

  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();

  return false;
}

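// Illustrative examples of what this accepts: v16i32 -> v16i8 on any AVX512
// target (512-bit source with 32-bit elements), v8i32 -> v8i16 only with VLX
// (256-bit source), and 16-bit sources such as v32i16 -> v32i8 only with BWI.
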
/// Detect patterns of truncation with unsigned saturation:
///
/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
///    Return the source value x to be truncated or SDValue() if the pattern
///    was not matched.
///
/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
///    where C1 >= 0 and C2 is unsigned max of destination type.
///
/// 3. (truncate (smax (smin (x, C2), C1)) to dest_type)
///    where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
///
/// These two patterns are equivalent to:
/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
/// So return the smax(x, C1) value to be truncated or SDValue() if the
/// pattern was not matched.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const SDLoc &DL) {
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  // Match min/max and return limit value as a parameter.
  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
      return V.getOperand(0);
    return SDValue();
  };

  APInt C1, C2;
  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
    if (C2.isMask(VT.getScalarSizeInBits()))
      return UMin;

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
    if (MatchMinMax(SMin, ISD::SMAX, C1))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
        return SMin;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
          C1.ule(C2))
        return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));

  return SDValue();
}

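// Worked example (illustrative), truncating to vXi8 (so C2 == 255):
//   umin(x, 255)                       returns x            (pattern 1)
//   smin(smax(x, C1), 255), C1 >= 0    returns smax(x, C1)  (pattern 2)
// since clamping to [C1, 255] and truncating is the same as truncating
// umin(smax(x, C1), 255).
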
/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
///                  signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
///                  signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
  unsigned NumDstBits = VT.getScalarSizeInBits();
  unsigned NumSrcBits = In.getScalarValueSizeInBits();
  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

  auto MatchMinMax = [](SDValue V, unsigned Opcode,
                        const APInt &Limit) -> SDValue {
    APInt C;
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
      return V.getOperand(0);
    return SDValue();
  };

  APInt SignedMax, SignedMin;
  if (MatchPackUS) {
    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
    SignedMin = APInt(NumSrcBits, 0);
  } else {
    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
  }

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
      return SMax;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
      return SMin;

  return SDValue();
}

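// Worked example (illustrative), truncating to vXi8:
//   smin(smax(x, -128), 127)  and  smax(smin(x, 127), -128)
// both return x; with MatchPackUS the accepted clamp is [0, 255], matching
// what PACKUS saturates to.
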
/// Detect a pattern of truncation with signed saturation.
/// The types should allow use of the VPMOVSS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectSSatPattern(In, VT);
}

/// Detect a pattern of truncation with unsigned saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow use of the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                       const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectUSatPattern(In, VT, DAG, DL);
}

static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT SVT = VT.getScalarType();
  EVT InVT = In.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
      isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
    if (auto SSatVal = detectSSatPattern(In, VT))
      return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
    if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
  }
  if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
      (SVT == MVT::i8 || SVT == MVT::i16) &&
      (InSVT == MVT::i16 || InSVT == MVT::i32)) {
    if (auto USatVal = detectSSatPattern(In, VT, true)) {
      // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
      if (SVT == MVT::i8 && InSVT == MVT::i32) {
        EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                     VT.getVectorNumElements());
        SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
                                             DAG, Subtarget);
        if (Mid)
          return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
                                        Subtarget);
      } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
        return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                      Subtarget);
    }
    if (auto SSatVal = detectSSatPattern(In, VT))
      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
                                    Subtarget);
  }
  return SDValue();
}

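// For example (illustrative): a vXi32 -> vXi8 clamp to [0, 255] without
// AVX512 is emitted as PACKUSWB(PACKSSDW, PACKSSDW): PACKSSDW narrows
// i32 -> i16 without disturbing values already in [0, 255], then PACKUSWB
// applies the unsigned saturation while narrowing i16 -> i8.
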
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector())
    return SDValue();
  EVT InVT = In.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  EVT ScalarVT = VT.getVectorElementType();
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
        isPowerOf2_32(NumElems)))
    return SDValue();

  // InScalarVT is the intermediate type in the AVG pattern and it should be
  // greater than the original input type (i8/i16).
  EVT InScalarVT = InVT.getVectorElementType();
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
  //
  // In AVX512, the last instruction can also be a trunc store.
  if (In.getOpcode() != ISD::SRL)
    return SDValue();

  // A lambda checking the given SDValue is a constant vector and each element
  // is in the range [Min, Max].
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (SDValue Op : V->ops()) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return false;
      const APInt &Val = C->getAPIntValue();
      if (Val.ult(Min) || Val.ugt(Max))
        return false;
    }
    return true;
  };

  // Check if each element of the vector is right-shifted by one.
  auto LHS = In.getOperand(0);
  auto RHS = In.getOperand(1);
  if (!IsConstVectorInRange(RHS, 1, 1))
    return SDValue();
  if (LHS.getOpcode() != ISD::ADD)
    return SDValue();

  // Detect a pattern of a + b + 1 where the order doesn't matter.
  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                       ArrayRef<SDValue> Ops) {
    return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
  };

  // Take care of the case when one of the operands is a constant vector whose
  // element is in the range [1, 256].
  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
      Operands[0].getOperand(0).getValueType() == VT) {
    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit X86ISD::AVG instruction.
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                            { Operands[0].getOperand(0), Operands[1] },
                            AVGBuilder);
  }

  if (Operands[0].getOpcode() == ISD::ADD)
    std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction(s).
    return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                            { Operands[0].getOperand(0),
                              Operands[1].getOperand(0) }, AVGBuilder);
  }

  return SDValue();
}

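// For example (illustrative), for <16 x i8> %a, %b:
//   %za = zext %a to <16 x i32>; %zb = zext %b to <16 x i32>
//   trunc (lshr (add (add %za, %zb), 1), 1) to <16 x i8>
// becomes X86ISD::AVG(%a, %b), i.e. a single PAVGB.
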
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               AddressSpace, Alignment, &Fast) && !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);
    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                    Ld->getPointerInfo().getWithOffset(16),
                    MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}

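// For example (illustrative): on a subtarget where unaligned 32-byte loads
// are slow, a v8f32 load splits into two v4f32 loads at ptr and ptr+16, a
// TokenFactor joining the two chains, and a CONCAT_VECTORS rebuilding the
// 256-bit value.
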
/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}

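// For example (illustrative): <i1 0, i1 0, i1 1, i1 0> returns 2;
// <i1 1, i1 1, i1 0, i1 0> returns -1 (two true elements); a mask with no
// true elements also returns -1 since TrueIndex is never set.
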
/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}

/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}

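// For example (illustrative): a v4f32 masked load with mask
// <i1 0, i1 0, i1 1, i1 0> becomes a scalar f32 load from ptr+8 and an
// (insert_vector_elt src0, %scalar, 2), replacing the masked load.
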
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a select.
  // This allows the select operation to use a faster kind of select instruction
  // (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That would
  // cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}

static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }

  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}

/// If exactly one element of the mask is set for a non-truncating masked
/// store, it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}

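// For example (illustrative): a v4i32 masked store with mask
// <i1 0, i1 1, i1 0, i1 0> becomes (extract_vector_elt %val, 1) stored as a
// plain i32 store to ptr+4.
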
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore()) {
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
      return ScalarStore;

    // If the mask is checking (0 > X), we're creating a vector with all-zeros
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
    // cares about the sign bit of each mask element, so eliminate the compare:
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
    // Note that by waiting to match an x86-specific PCMPGT node, we're
    // eliminating potentially more complex matching of a setcc node which has
    // a full range of predicates.
    SDValue Mask = Mst->getMask();
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
             "Unexpected type for PCMPGT");
      return DAG.getMaskedStore(
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
    }

    // TODO: AVX512 targets should also be able to simplify something like the
    // pattern above, but that pattern will be different. It will either need to
    // match setcc more generally or match PCMPGTM later (in tablegen?).

    return SDValue();
  }

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}

static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
  // This will avoid a copy to k-register.
  if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
      StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      StoredVal.getOperand(0).getValueType() == MVT::i8) {
    return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
                        St->getBasePtr(), St->getPointerInfo(),
                        St->getAlignment(), St->getMemOperand()->getFlags());
  }

  // Widen v2i1/v4i1 stores to v8i1.
  if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
      Subtarget.hasAVX512()) {
    unsigned NumConcats = 8 / VT.getVectorNumElements();
    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
    Ops[0] = StoredVal;
    StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // Turn vXi1 stores of constants into a scalar store.
  if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
       VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
      ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(0, 32));
      Lo = combinevXi1ConstantToInteger(Lo, DAG);
      SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
                                      StoredVal->ops().slice(32, 32));
      Hi = combinevXi1ConstantToInteger(Hi, DAG);

      unsigned Alignment = St->getAlignment();

      SDValue Ptr0 = St->getBasePtr();
      SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);

      SDValue Ch0 =
          DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
                       Alignment, St->getMemOperand()->getFlags());
      SDValue Ch1 =
          DAG.getStore(St->getChain(), dl, Hi, Ptr1,
                       St->getPointerInfo().getWithOffset(4),
                       MinAlign(Alignment, 4U),
                       St->getMemOperand()->getFlags());
      return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
    }

    StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                     St->getPointerInfo().getWithOffset(16),
                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of X86ISD::AVG
    // instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (SDValue Val =
            detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(),
                                    Subtarget, TLI))
      return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);
    if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
                                              DAG, dl, Subtarget, TLI))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(),
                             St->getMemoryVT(), St->getMemOperand(), DAG);

    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From, To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store size.
    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

    unsigned SizeRatio = FromSz / ToSz;

    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(), NumElems*SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit.
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units.
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 8> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
    SmallVector<SDValue, 8> Ops;

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt =
        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
                     St->getAlignment(), St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector. For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          const SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !TLI.isOperationLegal(Opcode, SrcVT))
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}

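// For example (illustrative): trunc (mul v8i64 X, Y) to v8i32 becomes
// mul (trunc X), (trunc Y) when the v8i32 multiply is legal but the v8i64
// multiply is not, so the wide multiply is never materialized.
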
/// Truncate using ISD::AND mask and X86ISD::PACKUS.
static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();
  EVT InSVT = InVT.getVectorElementType();
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();

  // Split a long vector into vectors of legal type and mask to unset all bits
  // that won't appear in the result to prevent saturation.
  // TODO - we should be doing this at the maximum legal size but this is
  // causing regressions where we're concatenating back to max width just to
  // perform the AND and then extracting back again.....
  unsigned NumSubRegs = InVT.getSizeInBits() / 128;
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
  SmallVector<SDValue, 8> SubVecs(NumSubRegs);

  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);

  for (unsigned i = 0; i < NumSubRegs; i++) {
    SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                              DAG.getIntPtrConstant(i * NumSubRegElts, DL));
    SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
  }
  In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);

  return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
}

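// For example (illustrative): truncating v8i32 -> v8i16, each 128-bit half
// is ANDed with a splat of 0xFFFF so the PACKUS saturation can never
// trigger, then the halves are packed back down to i16.
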
/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
                                                 const X86Subtarget &Subtarget,
                                                 SelectionDAG &DAG) {
  SDValue In = N->getOperand(0);
  EVT InVT = In.getValueType();
  EVT OutVT = N->getValueType(0);
  In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
                   DAG.getValueType(OutVT));
  return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
}

/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with each
/// element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
  // truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
  if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);

  return SDValue();
}

/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Check we have a truncation suited for PACKSS/PACKUS.
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;

  // Use PACKUS if the input has zero-bits that extend all the way to the
  // packed/truncated value. e.g. masks, zext_in_reg, etc.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

  // Use PACKSS if the input has sign-bits that extend all the way to the
  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

  return SDValue();
}

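// For example (illustrative): truncating a v8i32 comparison result to
// v8i16 can use PACKSSDW because every element is 0 or -1 (all 32 bits are
// sign bits); a value zero-extended in-reg from i16 can use PACKUSDW on
// SSE4.1 because its top 16 bits are known zero.
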
// Try to form a MULHU or MULHS node by looking for
// (trunc (srl (mul ext, ext), 16))
// TODO: This is X86 specific because we want to be able to handle wide types
// before type legalization. But we can only do it if the vector will be
// legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
                            SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // First instruction should be a right shift of a multiply.
  if (Src.getOpcode() != ISD::SRL ||
      Src.getOperand(0).getOpcode() != ISD::MUL)
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Only handle vXi16 types that are at least 128-bits.
  if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
      VT.getVectorNumElements() < 8)
    return SDValue();

  // Input type should be vXi32.
  EVT InVT = Src.getValueType();
  if (InVT.getVectorElementType() != MVT::i32)
    return SDValue();

  // Need a shift by 16.
  APInt ShiftAmt;
  if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
      ShiftAmt != 16)
    return SDValue();

  SDValue LHS = Src.getOperand(0).getOperand(0);
  SDValue RHS = Src.getOperand(0).getOperand(1);

  unsigned ExtOpc = LHS.getOpcode();
  if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
      RHS.getOpcode() != ExtOpc)
    return SDValue();

  // Peek through the extends.
  LHS = LHS.getOperand(0);
  RHS = RHS.getOperand(0);

  // Ensure the input types match.
  if (LHS.getValueType() != VT || RHS.getValueType() != VT)
    return SDValue();

  unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
  return DAG.getNode(Opc, DL, VT, LHS, RHS);
}

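// For example (illustrative):
//   trunc (srl (mul (sext v8i16 %a to v8i32),
//                   (sext v8i16 %b to v8i32)), 16) to v8i16
// becomes (mulhs %a, %b), which selects to PMULHW; the zero-extended form
// becomes (mulhu %a, %b), i.e. PMULHUW.
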
// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
// from one vector with signed bytes from another vector, adds together
// adjacent pairs of 16-bit products, and saturates the result before
// truncating to 16-bits.
//
// Which looks something like this:
// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
//                 (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget,
                               const SDLoc &DL) {
  if (!VT.isVector() || !Subtarget.hasSSSE3())
    return SDValue();

  unsigned NumElems = VT.getVectorNumElements();
  EVT ScalarVT = VT.getVectorElementType();
  if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
    return SDValue();

  SDValue SSatVal = detectSSatPattern(In, VT);
  if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
    return SDValue();

  // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
  // of multiplies from even/odd elements.
  SDValue N0 = SSatVal.getOperand(0);
  SDValue N1 = SSatVal.getOperand(1);

  if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  SDValue N10 = N1.getOperand(0);
  SDValue N11 = N1.getOperand(1);

  // TODO: Handle constant vectors and use knownbits/computenumsignbits?
  // Canonicalize zero_extend to LHS.
  if (N01.getOpcode() == ISD::ZERO_EXTEND)
    std::swap(N00, N01);
  if (N11.getOpcode() == ISD::ZERO_EXTEND)
    std::swap(N10, N11);

  // Ensure we have a zero_extend and a sign_extend.
  if (N00.getOpcode() != ISD::ZERO_EXTEND ||
      N01.getOpcode() != ISD::SIGN_EXTEND ||
      N10.getOpcode() != ISD::ZERO_EXTEND ||
      N11.getOpcode() != ISD::SIGN_EXTEND)
    return SDValue();

  // Peek through the extends.
  N00 = N00.getOperand(0);
  N01 = N01.getOperand(0);
  N10 = N10.getOperand(0);
  N11 = N11.getOperand(0);

  // Ensure the extend is from vXi8.
  if (N00.getValueType().getVectorElementType() != MVT::i8 ||
      N01.getValueType().getVectorElementType() != MVT::i8 ||
      N10.getValueType().getVectorElementType() != MVT::i8 ||
      N11.getValueType().getVectorElementType() != MVT::i8)
    return SDValue();

  // All inputs should be build_vectors.
  if (N00.getOpcode() != ISD::BUILD_VECTOR ||
      N01.getOpcode() != ISD::BUILD_VECTOR ||
      N10.getOpcode() != ISD::BUILD_VECTOR ||
      N11.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();

  // N00/N10 are zero extended. N01/N11 are sign extended.

  // For each element, we need to ensure we have an odd element from one vector
  // multiplied by the odd element of another vector and the even element from
  // one of the same vectors being multiplied by the even element from the
  // other vector. So we need to make sure for each element i, this operator
  // is being performed:
  //  A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
  SDValue ZExtIn, SExtIn;
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue N00Elt = N00.getOperand(i);
    SDValue N01Elt = N01.getOperand(i);
    SDValue N10Elt = N10.getOperand(i);
    SDValue N11Elt = N11.getOperand(i);
    // TODO: Be more tolerant to undefs.
    if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
    auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
    auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
    auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
    auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
    if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
      return SDValue();
    unsigned IdxN00 = ConstN00Elt->getZExtValue();
    unsigned IdxN01 = ConstN01Elt->getZExtValue();
    unsigned IdxN10 = ConstN10Elt->getZExtValue();
    unsigned IdxN11 = ConstN11Elt->getZExtValue();
    // Add is commutative so indices can be reordered.
    if (IdxN00 > IdxN10) {
      std::swap(IdxN00, IdxN10);
      std::swap(IdxN01, IdxN11);
    }
    // N0 indices must be the even element. N1 indices must be the next odd
    // element.
    if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
        IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
      return SDValue();
    SDValue N00In = N00Elt.getOperand(0);
    SDValue N01In = N01Elt.getOperand(0);
    SDValue N10In = N10Elt.getOperand(0);
    SDValue N11In = N11Elt.getOperand(0);
    // First time we find an input capture it.
    if (!ZExtIn) {
      ZExtIn = N00In;
      SExtIn = N01In;
    }
    if (ZExtIn != N00In || SExtIn != N01In ||
        ZExtIn != N10In || SExtIn != N11In)
      return SDValue();
  }

  auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                         ArrayRef<SDValue> Ops) {
    // Shrink by adding truncate nodes and let DAGCombine fold with the
    // sources.
    EVT InVT = Ops[0].getValueType();
    assert(InVT.getScalarType() == MVT::i8 &&
           "Unexpected scalar element type");
    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
    EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                 InVT.getVectorNumElements() / 2);
    return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
                          PMADDBuilder);
}

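// NOTE (editorial illustration, not upstream code): for v8i16 the matched
// computation reads, per output element i:
//   ssat16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// which is exactly PMADDUBSW(A, B) with A treated as unsigned bytes and B as
// signed bytes.
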
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to detect PMADDUBSW.
  if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
    return PMAdd;

  // Try to combine truncation with signed/unsigned saturation.
  if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // Try to combine PMULHUW/PMULHW for vXi16.
  if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
    return V;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  // Extract constant bits and see if they are all sign bit masks.
  APInt UndefElts;
  SmallVector<APInt, 16> EltBits;
  if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
                                    UndefElts, EltBits, false, false))
    if (llvm::all_of(EltBits, [](APInt &I) { return I.isSignMask(); }))
      return peekThroughBitcasts(Op.getOperand(0));

  return SDValue();
}

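// NOTE (editorial illustration, not upstream code): on AVX512F a v4f32 fneg
// may reach here in the bitcasted-integer form that the code above peels back:
//   (v4f32 bitcast (v4i32 xor (v4i32 bitcast X),
//                             (v4i32 build_vector 0x80000000, ...)))
// Every constant lane is a sign-bit mask, so this returns X.
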
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
    switch (Arg.getOpcode()) {
    case ISD::FMA:           NewOpcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB:     NewOpcode = ISD::FMA;           break;
    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}

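// NOTE (editorial illustration, not upstream code): with FMA available and no
// signed zeros,
//   (fneg (fmul A, B)) --> (FNMSUB A, B, 0.0)   // computes 0 - A*B
// so -(A*B) needs no materialized sign-mask constant; similarly
// (fneg (fma A, B, C)) = -(A*B) - C flips to (FNMSUB A, B, C) per the switch.
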
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  // If we have integer vector types available, use the integer opcodes.
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR:   IntOpcode = ISD::OR;       break;
    case X86ISD::FXOR:  IntOpcode = ISD::XOR;      break;
    case X86ISD::FAND:  IntOpcode = ISD::AND;      break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::XOR)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
    return SDValue();

  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
      X86::CondCode(LHS->getConstantOperandVal(0)));
  SDLoc DL(N);
  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}

static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // If this is SSE1 only convert to FXOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
      N->getValueType(0) == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue SetCC = foldXor1SetCC(N, DAG))
    return SetCC;

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}

static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  unsigned NumBits = VT.getSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // TODO - Constant Folding.
  if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
    // Reduce Cst1 to the bottom 16-bits.
    // NOTE: SimplifyDemandedBits won't do this for constants.
    const APInt &Val1 = Cst1->getAPIntValue();
    APInt MaskedVal1 = Val1 & 0xFFFF;
    if (MaskedVal1 != Val1)
      return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
                         DAG.getConstant(MaskedVal1, SDLoc(N), VT));
  }

  // Only bottom 16-bits of the control bits are required.
  KnownBits Known;
  APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
  if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}

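// NOTE (editorial, not upstream code): BEXTR's control word uses
// bits[7:0] as the start position and bits[15:8] as the length, so only the
// low 16 bits of Op1 matter; e.g. (BEXTR x, 0x12345608) simplifies to
// (BEXTR x, 0x5608) via the constant-masking path above.
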
static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.

  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;

  // If we don't have to respect NaN inputs, this is a direct translation to x86
  // min/max instructions.
  if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
    return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());

  // If we have to respect NaN inputs, this takes at least 3 instructions.
  // Favor a library call when operating on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.
  //
  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

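// NOTE (editorial illustration, not upstream code): for scalar f32 fmaxnum
// the nodes built above select to roughly this 3-instruction sequence:
//   maxss      %xmm0, %xmm1      ; Op0 passes through if either input is NaN
//   cmpunordss %xmm0, %xmm0      ; all-ones iff Op0 is NaN
//   (blend/select NaN-mask ? Op1 : max)
// which realizes the NaN table in the comment above.
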
/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}

// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();

  if (ExtraVT != MVT::i16)
    return SDValue();

  // Look through single use any_extends.
  if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
    N0 = N0.getOperand(0);

  // See if we have a single use cmov.
  if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
    return SDValue();

  SDValue CMovOp0 = N0.getOperand(0);
  SDValue CMovOp1 = N0.getOperand(1);

  // Make sure both operands are constants.
  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
      !isa<ConstantSDNode>(CMovOp1.getNode()))
    return SDValue();

  SDLoc DL(N);

  // If we looked through an any_extend above, extend the constants to match.
  if (N0.getValueType() != VT) {
    CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
    CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
  }

  CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
  CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);

  return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
                     N0.getOperand(2), N0.getOperand(3));
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (SDValue V = combineSextInRegCmov(N, DAG))
    return V;

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
  // since there is no sign-extended shift right operation on a vector with
  // 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

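// NOTE (editorial illustration, not upstream code): given IR such as
//   %a = add nsw i32 %x, 4
//   %e = sext i32 %a to i64
//   %p = shl i64 %e, 3
// the combine rewrites %e to (add (sext %x), 4); the add and shl can then
// fold into a single scaled-index LEA along the lines of
// lea 32(,%rax,8), %rcx.
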
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
      !(VT == MVT::i32 || VT == MVT::i64))
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  // If this was a 64-bit extend, complete it.
  if (VT == MVT::i64)
    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
  return R.getValue(1);
}

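// NOTE (editorial illustration, not upstream code): for
//   %r = srem i8 %x, %y
//   %e = sext i8 %r to i32
// this forms SDIVREM8_SEXT_HREG, whose second (i32) result is the remainder
// sign-extended directly from AH, avoiding a shift/extract of AX after IDIV.
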
// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
//         promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
      !isa<ConstantSDNode>(CMovOp1.getNode()))
    return SDValue();

  // Only extend to i32 or i64.
  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
    return SDValue();

  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from
  // i32 are free.
  if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
    return SDValue();

  // If this is a zero extend to i64, we should only extend to i32 and use a
  // free zero extend to finish.
  EVT ExtendVT = TargetVT;
  if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
    ExtendVT = MVT::i32;

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);

  SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
                            CMovN.getOperand(2), CMovN.getOperand(3));

  // Finish extending if needed.
  if (ExtendVT != TargetVT)
    Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);

  return Res;
}

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

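// NOTE (editorial illustration, not upstream code): extending
// (v8i1 bitcast (i8 %m)) to v8i16 via the code above broadcasts %m to all 8
// lanes, ANDs lane i with (1 << i), and compares equal against the same mask,
// so lanes whose bit is set become all-ones; for zext those lanes are then
// shifted right by 15 to yield 0 or 1.
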
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
// result type.
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc dl(N);

  // Only do this combine with AVX512 for vector extends.
  if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
    return SDValue();

  // Only combine legal element types.
  EVT SVT = VT.getVectorElementType();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
      SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
    return SDValue();

  // We can only do this if the vector size is 256 bits or less.
  unsigned Size = VT.getSizeInBits();
  if (Size > 256)
    return SDValue();

  // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
  // those are the only integer compares we have.
  ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
  if (ISD::isUnsignedIntSetCC(CC))
    return SDValue();

  // Only do this combine if the extension will be fully consumed by the setcc.
  EVT N00VT = N0.getOperand(0).getValueType();
  EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
  if (Size != MatchingVecType.getSizeInBits())
    return SDValue();

  SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);

  if (N->getOpcode() == ISD::ZERO_EXTEND)
    Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());

  return Res;
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
    return V;

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Invert and sign-extend a boolean is the same as zero-extend and subtract
    // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
    // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
  if (NegMul) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::FMA:           Opcode = X86ISD::FNMADD;     break;
    case X86ISD::FMADD_RND:  Opcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FMSUB:      Opcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB_RND:  Opcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FNMADD:     Opcode = ISD::FMA;           break;
    case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND;  break;
    case X86ISD::FNMSUB:     Opcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND;  break;
    }
  }

  if (NegAcc) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::FMA:           Opcode = X86ISD::FMSUB;      break;
    case X86ISD::FMADD_RND:  Opcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FMSUB:      Opcode = ISD::FMA;           break;
    case X86ISD::FMSUB_RND:  Opcode = X86ISD::FMADD_RND;  break;
    case X86ISD::FNMADD:     Opcode = X86ISD::FNMSUB;     break;
    case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FNMSUB:     Opcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
    }
  }

  return Opcode;
}

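// NOTE (editorial usage note, not upstream code): callers pass which inputs
// were negated, e.g. negateFMAOpcode(ISD::FMA, /*NegMul=*/true,
// /*NegAcc=*/false) yields X86ISD::FNMADD since -(A*B)+C is FNMADD, and
// setting both flags maps FMA to FNMSUB, i.e. -(A*B)-C.
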
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [&DAG](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = DAG.getBitcast(V.getValueType(), NegVal);
      return true;
    }
    // Look through extract_vector_elts. If it comes from an FNEG, create a
    // new extract from the FNEG input.
    if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        isNullConstant(V.getOperand(1))) {
      if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
        NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
        V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
                        NegVal, V.getOperand(1));
        return true;
      }
    }

    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = invertIfNegative(C);

  if (!NegA && !NegB && !NegC)
    return SDValue();

  unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SDValue NegVal = isFNEG(N->getOperand(2).getNode());
  if (!NegVal)
    return SDValue();

  unsigned NewOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::FMADDSUB:     NewOpcode = X86ISD::FMSUBADD;     break;
  case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
  case X86ISD::FMSUBADD:     NewOpcode = X86ISD::FMADDSUB;     break;
  case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
  }

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (DCI.isBeforeLegalizeOps())
    if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
      return V;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern may
  // be generated by the memcmp expansion pass with oversized integer compares.
  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
                          X.getOperand(0).getOpcode() == ISD::XOR &&
                          X.getOperand(1).getOpcode() == ISD::XOR;
  if (isNullConstant(Y) && !IsOrXorXorCCZero)
    return SDValue();

  // Bail out if we know that this is not really just an oversized integer.
  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
      peekThroughBitcasts(Y).getValueType() == MVT::f128)
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue Cmp;
    if (IsOrXorXorCCZero) {
      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
      SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
      SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
    } else {
      SDValue VecX = DAG.getBitcast(VecVT, X);
      SDValue VecY = DAG.getBitcast(VecVT, Y);
      Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
    }
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

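// NOTE (editorial illustration, not upstream code): a 16-byte memcmp equality
// test such as
//   (i1 setcc (i128 load %p), (i128 load %q), eq)
// becomes PCMPEQB of the two 16-byte vectors, PMOVMSKB of the result, and a
// scalar compare of the mask against 0xFFFF, replacing two 64-bit compares
// and a branch chain.
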
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT OpVT = LHS.getValueType();
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
  // pre-promote its result type since vXi1 vectors don't get promoted
  // during type legalization.
  // NOTE: The element count check is to ignore operand types that need to
  // go through type promotion to a 128-bit vector.
  if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
      VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
      (OpVT.getVectorElementType() == MVT::i8 ||
       OpVT.getVectorElementType() == MVT::i16)) {
    SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
                                N->getOperand(2));
    return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  // Perform constant folding.
  if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
    assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
    APInt Imm(32, 0);
    for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
      SDValue In = Src.getOperand(Idx);
      if (!In.isUndef() &&
          cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
        Imm.setBit(Idx);
    }
    return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // MOVMSK only uses the MSB from each vector element.
  KnownBits Known;
  APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
  if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
    DCI.AddToWorklist(Src.getNode());
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    // Remove any sign extends from 32 or smaller to larger than 32.
    // Only do this before LegalizeOps in case we need the sign extend for
    // legalization.
    if (Index.getOpcode() == ISD::SIGN_EXTEND) {
      if (Index.getScalarValueSizeInBits() > 32 &&
          Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
        if (Res == N) {
          // The original sign extend has fewer users; add it back to the
          // worklist in case it needs to be removed.
          DCI.AddToWorklist(Index.getNode());
          DCI.AddToWorklist(N);
        }
        return SDValue(Res, 0);
      }
    }

    // Make sure the index is either i32 or i64.
    unsigned ScalarSize = Index.getScalarValueSizeInBits();
    if (ScalarSize != 32 && ScalarSize != 64) {
      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                     Index.getValueType().getVectorNumElements());
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index;
      SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
      if (Res == N)
        DCI.AddToWorklist(N);
      return SDValue(Res, 0);
    }

    // Try to remove zero extends from 32->64 if we know the sign bit of
    // the input is zero.
    if (Index.getOpcode() == ISD::ZERO_EXTEND &&
        Index.getScalarValueSizeInBits() == 64 &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      if (DAG.SignBitIsZero(Index.getOperand(0))) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
        if (Res == N) {
          // The original zero extend has fewer users; add it back to the
          // worklist in case it needs to be removed.
          DCI.AddToWorklist(Index.getNode());
          DCI.AddToWorklist(N);
        }
        return SDValue(Res, 0);
      }
    }
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    SDValue Mask = N->getOperand(2);
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

38348 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
38349 SelectionDAG &DAG) {
38350 // Take advantage of vector comparisons producing 0 or -1 in each lane to
38351 // optimize away operation when it's from a constant.
38353 // The general transformation is:
38354 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
38355 // AND(VECTOR_CMP(x,y), constant2)
38356 // constant2 = UNARYOP(constant)
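// A concrete instance, with UNARYOP == SINT_TO_FP:
//   (sint_to_fp (and (setcc x, y), <i32 1, i32 1, i32 1, i32 1>))
//     --> (and (setcc x, y), bitcast(<4 x float> <1.0, 1.0, 1.0, 1.0>))
// which works because every lane of the compare result is 0 or -1.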
38358 // Early exit if this isn't a vector operation, the operand of the
38359 // unary operation isn't a bitwise AND, or if the sizes of the operations
38360 // aren't the same.
38361 EVT VT = N->getValueType(0);
38362 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
38363 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
38364 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
38365 return SDValue();
38367 // Now check that the other operand of the AND is a constant. We could
38368 // make the transformation for non-constant splats as well, but it's unclear
38369 // that would be a benefit as it would not eliminate any operations, just
38370 // perform one more step in scalar code before moving to the vector unit.
38371 if (BuildVectorSDNode *BV =
38372 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
38373 // Bail out if the vector isn't a constant.
38374 if (!BV->isConstant())
38375 return SDValue();
38377 // Everything checks out. Build up the new and improved node.
38378 SDLoc DL(N);
38379 EVT IntVT = BV->getValueType(0);
38380 // Create a new constant of the appropriate type for the transformed op.
38382 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
38383 // The AND node needs bitcasts to/from an integer vector type around it.
38384 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
38385 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
38386 N->getOperand(0)->getOperand(0), MaskConst);
38387 SDValue Res = DAG.getBitcast(VT, NewAnd);
38388 return Res;
38389 }
38391 return SDValue();
38392 }
38394 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
38395 const X86Subtarget &Subtarget) {
38396 SDValue Op0 = N->getOperand(0);
38397 EVT VT = N->getValueType(0);
38398 EVT InVT = Op0.getValueType();
38400 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38401 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
38402 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
38403 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38404 SDLoc dl(N);
38405 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38406 InVT.getVectorNumElements());
38407 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
38409 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
38410 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38411 }
38413 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
38414 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
38415 // the optimization here.
38416 if (DAG.SignBitIsZero(Op0))
38417 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
38419 return SDValue();
38420 }
38422 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
38423 const X86Subtarget &Subtarget) {
38424 // First try to optimize away the conversion entirely when it's
38425 // conditionally from a constant. Vectors only.
38426 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
38427 return Res;
38429 // Now move on to more general possibilities.
38430 SDValue Op0 = N->getOperand(0);
38431 EVT VT = N->getValueType(0);
38432 EVT InVT = Op0.getValueType();
38434 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38435 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
38436 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
38437 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38438 SDLoc dl(N);
38439 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38440 InVT.getVectorNumElements());
38441 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
38442 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38445 // Without AVX512DQ we only support i64 to float scalar conversion. For both
38446 // vectors and scalars, see if we know that the upper bits are all the sign
38447 // bit, in which case we can truncate the input to i32 and convert from that.
38448 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
38449 unsigned BitWidth = InVT.getScalarSizeInBits();
38450 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
38451 if (NumSignBits >= (BitWidth - 31)) {
38452 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
38453 if (InVT.isVector())
38454 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
38455 InVT.getVectorNumElements());
38456 SDLoc dl(N);
38457 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
38458 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
38462 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
38463 // a 32-bit target where SSE doesn't support i64->FP operations.
38464 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
38465 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
38466 EVT LdVT = Ld->getValueType(0);
38468 // This transformation is not supported if the result type is f16 or f128.
38469 if (VT == MVT::f16 || VT == MVT::f128)
38470 return SDValue();
38472 // If we have AVX512DQ we can use packed conversion instructions unless
38473 // the VT is f80.
38474 if (Subtarget.hasDQI() && VT != MVT::f80)
38475 return SDValue();
38477 if (!Ld->isVolatile() && !VT.isVector() &&
38478 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
38479 !Subtarget.is64Bit() && LdVT == MVT::i64) {
38480 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
38481 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
38482 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
38483 return FILDChain;
38484 }
38485 }
38487 return SDValue();
38488 }
38489 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
38490 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38491 MVT VT = N->getSimpleValueType(0);
38492 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38493 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
38494 N->getOperand(0), N->getOperand(1),
38495 Flags);
38496 }
38498 return SDValue();
38499 }
38501 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
38502 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
38503 TargetLowering::DAGCombinerInfo &DCI) {
38504 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
38505 // the result is either zero or one (depending on the input carry bit).
38506 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
38507 if (X86::isZeroNode(N->getOperand(0)) &&
38508 X86::isZeroNode(N->getOperand(1)) &&
38509 // We don't have a good way to replace an EFLAGS use, so only do this when
38510 // the EFLAGS result is unused.
38511 SDValue(N, 1).use_empty()) {
38512 SDLoc DL(N);
38513 EVT VT = N->getValueType(0);
38514 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
38515 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
38516 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38517 DAG.getConstant(X86::COND_B, DL,
38518 MVT::i8),
38519 N->getOperand(2)),
38520 DAG.getConstant(1, DL, VT));
38521 return DCI.CombineTo(N, Res1, CarryOut);
38524 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38525 MVT VT = N->getSimpleValueType(0);
38526 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38527 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
38528 N->getOperand(0), N->getOperand(1),
38529 Flags);
38530 }
38532 return SDValue();
38533 }
38535 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
38536 /// which is more useful than 0/1 in some cases.
38537 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
38538 SDLoc DL(N);
38539 // "Condition code B" is also known as "the carry flag" (CF).
38540 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
38541 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
38542 MVT VT = N->getSimpleValueType(0);
38543 if (VT == MVT::i8)
38544 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
38546 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
38547 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
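// Illustrative note: for i8 this lowers to the "sbb %al, %al" idiom, which
// yields 0x00 or 0xFF from CF; the AND with 1 then recovers the 0/1 value
// that "setb" would have produced.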
38550 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
38551 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
38552 /// with CMP+{ADC, SBB}.
38553 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
38554 bool IsSub = N->getOpcode() == ISD::SUB;
38555 SDValue X = N->getOperand(0);
38556 SDValue Y = N->getOperand(1);
38558 // If this is an add, canonicalize a zext operand to the RHS.
38559 // TODO: Incomplete? What if both sides are zexts?
38560 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
38561 Y.getOpcode() != ISD::ZERO_EXTEND)
38562 std::swap(X, Y);
38564 // Look through a one-use zext.
38565 bool PeekedThroughZext = false;
38566 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
38567 Y = Y.getOperand(0);
38568 PeekedThroughZext = true;
38571 // If this is an add, canonicalize a setcc operand to the RHS.
38572 // TODO: Incomplete? What if both sides are setcc?
38573 // TODO: Should we allow peeking through a zext of the other operand?
38574 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
38575 Y.getOpcode() != X86ISD::SETCC)
38576 std::swap(X, Y);
38578 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
38579 return SDValue();
38581 SDLoc DL(N);
38582 EVT VT = N->getValueType(0);
38583 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
38585 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38586 // the general case below.
38587 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
38588 if (ConstantX) {
38589 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
38590 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
38591 // This is a complicated way to get -1 or 0 from the carry flag:
38592 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38593 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38594 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38595 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38596 Y.getOperand(1));
38597 }
38599 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
38600 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
38601 SDValue EFLAGS = Y->getOperand(1);
38602 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38603 EFLAGS.getValueType().isInteger() &&
38604 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38605 // Swap the operands of a SUB, and we have the same pattern as above.
38606 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
38607 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
38608 SDValue NewSub = DAG.getNode(
38609 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
38610 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38611 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38612 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38613 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38614 NewEFLAGS);
38615 }
38616 }
38617 }
38619 if (CC == X86::COND_B) {
38620 // X + SETB Z --> X + (mask SBB Z, Z)
38621 // X - SETB Z --> X - (mask SBB Z, Z)
38622 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
38623 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
38624 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38625 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38626 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
38629 if (CC == X86::COND_A) {
38630 SDValue EFLAGS = Y->getOperand(1);
38631 // Try to convert COND_A into COND_B in an attempt to facilitate
38632 // materializing "setb reg".
38634 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
38635 // cannot take an immediate as its first operand.
38637 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38638 EFLAGS.getValueType().isInteger() &&
38639 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38640 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
38641 EFLAGS.getNode()->getVTList(),
38642 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38643 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38644 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
38645 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38646 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38647 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
38651 if (CC != X86::COND_E && CC != X86::COND_NE)
38652 return SDValue();
38654 SDValue Cmp = Y.getOperand(1);
38655 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
38656 !X86::isZeroNode(Cmp.getOperand(1)) ||
38657 !Cmp.getOperand(0).getValueType().isInteger())
38658 return SDValue();
38660 SDValue Z = Cmp.getOperand(0);
38661 EVT ZVT = Z.getValueType();
38663 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38664 // the general case below.
38665 if (ConstantX) {
38666 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
38667 // fake operands:
38668 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
38669 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
38670 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
38671 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
38672 SDValue Zero = DAG.getConstant(0, DL, ZVT);
38673 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
38674 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
38675 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38676 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38677 SDValue(Neg.getNode(), 1));
38680 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
38681 // with fake operands:
38682 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
38683 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
38684 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
38685 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
38686 SDValue One = DAG.getConstant(1, DL, ZVT);
38687 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38688 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38689 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
38693 // (cmp Z, 1) sets the carry flag if Z is 0.
38694 SDValue One = DAG.getConstant(1, DL, ZVT);
38695 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38697 // Add the flags type for ADC/SBB nodes.
38698 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38700 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
38701 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
38702 if (CC == X86::COND_NE)
38703 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
38704 DAG.getConstant(-1ULL, DL, VT), Cmp1);
38706 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
38707 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
38708 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
38709 DAG.getConstant(0, DL, VT), Cmp1);
38712 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
38713 const X86Subtarget &Subtarget) {
38714 if (!Subtarget.hasSSE2())
38715 return SDValue();
38717 SDValue MulOp = N->getOperand(0);
38718 SDValue Phi = N->getOperand(1);
38720 if (MulOp.getOpcode() != ISD::MUL)
38721 std::swap(MulOp, Phi);
38722 if (MulOp.getOpcode() != ISD::MUL)
38723 return SDValue();
38725 ShrinkMode Mode;
38726 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
38727 return SDValue();
38729 EVT VT = N->getValueType(0);
38731 // If the vector size is less than 128, or greater than the supported RegSize,
38732 // do not use PMADD.
38733 if (VT.getVectorNumElements() < 8)
38734 return SDValue();
38736 SDLoc DL(N);
38737 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38738 VT.getVectorNumElements());
38739 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38740 VT.getVectorNumElements() / 2);
38742 // Shrink the operands of mul.
38743 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
38744 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
38746 // Madd vector size is half of the original vector size
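// Illustrative note: VPMADDWD multiplies pairs of i16 elements and adds
// adjacent products into i32 lanes, e.g. v8i16 x v8i16 -> v4i32 with
//   r[i] = a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]
// which is why the result has half as many elements as the inputs.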
38747 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38748 ArrayRef<SDValue> Ops) {
38749 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
38750 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
38751 };
38752 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
38753 PMADDWDBuilder);
38754 // Fill the rest of the output with 0
38755 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
38756 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
38757 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
38760 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
38761 const X86Subtarget &Subtarget) {
38762 if (!Subtarget.hasSSE2())
38763 return SDValue();
38765 SDLoc DL(N);
38766 EVT VT = N->getValueType(0);
38767 SDValue Op0 = N->getOperand(0);
38768 SDValue Op1 = N->getOperand(1);
38770 // TODO: There's nothing special about i32, any integer type above i16 should
38771 // work just as well.
38772 if (!VT.isVector() || !VT.isSimple() ||
38773 !(VT.getVectorElementType() == MVT::i32))
38774 return SDValue();
38776 unsigned RegSize = 128;
38777 if (Subtarget.useBWIRegs())
38778 RegSize = 512;
38779 else if (Subtarget.hasAVX())
38780 RegSize = 256;
38782 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
38783 // TODO: We should be able to handle larger vectors by splitting them before
38784 // feeding them into several SADs, and then reducing over those.
38785 if (VT.getSizeInBits() / 4 > RegSize)
38786 return SDValue();
38788 // We know N is a reduction add, which means one of its operands is a phi.
38789 // To match SAD, we need the other operand to be a vector select.
38790 SDValue SelectOp, Phi;
38791 if (Op0.getOpcode() == ISD::VSELECT) {
38792 SelectOp = Op0;
38793 Phi = Op1;
38794 } else if (Op1.getOpcode() == ISD::VSELECT) {
38795 SelectOp = Op1;
38796 Phi = Op0;
38797 } else
38798 return SDValue();
38800 // Check whether we have an abs-diff pattern feeding into the select.
38801 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
38802 return SDValue();
38804 // SAD pattern detected. Now build a SAD instruction and an addition for
38805 // reduction. Note that the number of elements of the result of SAD is less
38806 // than the number of elements of its input. Therefore, we can only update
38807 // part of the elements in the reduction vector.
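// As a reference point, PSADBW on v16i8 inputs produces a v2i64 result in
// which each 64-bit lane holds sum(|a[i] - b[i]|) over its group of eight
// bytes.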
38808 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
38810 // The output of PSADBW is a vector of i64.
38811 // We need to turn the vector of i64 into a vector of i32.
38812 // If the reduction vector is at least as wide as the psadbw result, just
38813 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
38814 // anyway.
38815 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
38816 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
38817 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
38818 else
38819 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
38821 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
38822 // Fill the upper elements with zero to match the add width.
38823 SDValue Zero = DAG.getConstant(0, DL, VT);
38824 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
38825 DAG.getIntPtrConstant(0, DL));
38828 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
38831 /// Convert vector increment or decrement to sub/add with an all-ones constant:
38832 /// add X, <1, 1...> --> sub X, <-1, -1...>
38833 /// sub X, <1, 1...> --> add X, <-1, -1...>
38834 /// The all-ones vector constant can be materialized using a pcmpeq instruction
38835 /// that is commonly recognized as an idiom (has no register dependency), so
38836 /// that's better/smaller than loading a splat 1 constant.
38837 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
38838 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
38839 "Unexpected opcode for increment/decrement transform");
38841 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
38842 // out and wait for legalization if we have an unsupported vector length.
38843 EVT VT = N->getValueType(0);
38844 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
38845 return SDValue();
38847 SDNode *N1 = N->getOperand(1).getNode();
38848 APInt SplatVal;
38849 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
38850 !SplatVal.isOneValue())
38851 return SDValue();
38853 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
38854 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
38855 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
38858 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
38859 const SDLoc &DL, EVT VT,
38860 const X86Subtarget &Subtarget) {
38861 // Example of pattern we try to detect:
38862 // t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1)))
38863 //(add (build_vector (extract_elt t, 0),
38864 // (extract_elt t, 2),
38865 // (extract_elt t, 4),
38866 // (extract_elt t, 6)),
38867 // (build_vector (extract_elt t, 1),
38868 // (extract_elt t, 3),
38869 // (extract_elt t, 5),
38870 // (extract_elt t, 7)))
38872 if (!Subtarget.hasSSE2())
38873 return SDValue();
38875 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
38876 Op1.getOpcode() != ISD::BUILD_VECTOR)
38877 return SDValue();
38879 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38880 VT.getVectorNumElements() < 4 ||
38881 !isPowerOf2_32(VT.getVectorNumElements()))
38882 return SDValue();
38884 // Check if one of Op0,Op1 is of the form:
38885 // (build_vector (extract_elt Mul, 0),
38886 // (extract_elt Mul, 2),
38887 // (extract_elt Mul, 4),
38888 //  ...),
38889 // the other is of the form:
38890 // (build_vector (extract_elt Mul, 1),
38891 // (extract_elt Mul, 3),
38892 // (extract_elt Mul, 5),
38893 //  ...),
38894 // and identify Mul.
38895 SDValue Mul;
38896 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
38897 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
38898 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
38899 // TODO: Be more tolerant to undefs.
38900 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38901 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38902 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38903 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
38904 return SDValue();
38905 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
38906 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38907 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38908 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
38909 if (!Const0L || !Const1L || !Const0H || !Const1H)
38910 return SDValue();
38911 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38912 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38913 // Commutativity of mul allows factors of a product to reorder.
38914 if (Idx0L > Idx1L)
38915 std::swap(Idx0L, Idx1L);
38916 if (Idx0H > Idx1H)
38917 std::swap(Idx0H, Idx1H);
38918 // Commutativity of add allows pairs of factors to reorder.
38919 if (Idx0L > Idx0H) {
38920 std::swap(Idx0L, Idx0H);
38921 std::swap(Idx1L, Idx1H);
38923 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
38924 Idx1H != 2 * i + 3)
38925 return SDValue();
38926 if (!Mul) {
38927 // First time an extract_elt's source vector is visited. Must be a MUL
38928 // with 2X the number of vector elements of the BUILD_VECTOR.
38929 // Both extracts must be from the same MUL.
38930 Mul = Op0L->getOperand(0);
38931 if (Mul->getOpcode() != ISD::MUL ||
38932 Mul.getValueType().getVectorNumElements() != 2 * e)
38933 return SDValue();
38934 } else {
38935 // Check that the extract is from the same MUL previously seen.
38936 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
38937 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
38938 return SDValue();
38939 }
38940 }
38941 // Check if the Mul source can be safely shrunk.
38942 ShrinkMode Mode;
38943 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
38944 return SDValue();
38946 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38947 ArrayRef<SDValue> Ops) {
38948 // Shrink by adding truncate nodes and let DAGCombine fold with the
38949 // sources.
38950 EVT InVT = Ops[0].getValueType();
38951 assert(InVT.getScalarType() == MVT::i32 &&
38952 "Unexpected scalar element type");
38953 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38954 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38955 InVT.getVectorNumElements() / 2);
38956 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38957 InVT.getVectorNumElements());
38958 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38959 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
38960 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
38961 };
38962 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
38963 { Mul.getOperand(0), Mul.getOperand(1) },
38964 PMADDBuilder);
38965 }
38967 // Attempt to turn this pattern into PMADDWD.
38968 // (mul (add (zext (build_vector)), (zext (build_vector))),
38969 // (add (zext (build_vector)), (zext (build_vector)))
38970 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
38971 const SDLoc &DL, EVT VT,
38972 const X86Subtarget &Subtarget) {
38973 if (!Subtarget.hasSSE2())
38974 return SDValue();
38976 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
38977 return SDValue();
38979 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38980 VT.getVectorNumElements() < 4 ||
38981 !isPowerOf2_32(VT.getVectorNumElements()))
38982 return SDValue();
38984 SDValue N00 = N0.getOperand(0);
38985 SDValue N01 = N0.getOperand(1);
38986 SDValue N10 = N1.getOperand(0);
38987 SDValue N11 = N1.getOperand(1);
38989 // All inputs need to be sign extends.
38990 // TODO: Support ZERO_EXTEND from known positive?
38991 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
38992 N01.getOpcode() != ISD::SIGN_EXTEND ||
38993 N10.getOpcode() != ISD::SIGN_EXTEND ||
38994 N11.getOpcode() != ISD::SIGN_EXTEND)
38995 return SDValue();
38997 // Peek through the extends.
38998 N00 = N00.getOperand(0);
38999 N01 = N01.getOperand(0);
39000 N10 = N10.getOperand(0);
39001 N11 = N11.getOperand(0);
39003 // Must be extending from vXi16.
39004 EVT InVT = N00.getValueType();
39005 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
39006 N10.getValueType() != InVT || N11.getValueType() != InVT)
39007 return SDValue();
39009 // All inputs should be build_vectors.
39010 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
39011 N01.getOpcode() != ISD::BUILD_VECTOR ||
39012 N10.getOpcode() != ISD::BUILD_VECTOR ||
39013 N11.getOpcode() != ISD::BUILD_VECTOR)
39014 return SDValue();
39016 // For each element, we need to ensure we have an odd element from one vector
39017 // multiplied by the odd element of another vector and the even element from
39018 // one of the same vectors being multiplied by the even element from the
39019 // other vector. So we need to make sure for each element i, this operator
39020 // is being performed:
39021 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
39022 SDValue In0, In1;
39023 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
39024 SDValue N00Elt = N00.getOperand(i);
39025 SDValue N01Elt = N01.getOperand(i);
39026 SDValue N10Elt = N10.getOperand(i);
39027 SDValue N11Elt = N11.getOperand(i);
39028 // TODO: Be more tolerant to undefs.
39029 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39030 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39031 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39032 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
39033 return SDValue();
39034 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
39035 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
39036 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
39037 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
39038 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
39039 return SDValue();
39040 unsigned IdxN00 = ConstN00Elt->getZExtValue();
39041 unsigned IdxN01 = ConstN01Elt->getZExtValue();
39042 unsigned IdxN10 = ConstN10Elt->getZExtValue();
39043 unsigned IdxN11 = ConstN11Elt->getZExtValue();
39044 // Add is commutative so indices can be reordered.
39045 if (IdxN00 > IdxN10) {
39046 std::swap(IdxN00, IdxN10);
39047 std::swap(IdxN01, IdxN11);
39049 // N0 indices must be the even elements; N1 indices must be the next odd elements.
39050 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
39051 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
39052 return SDValue();
39053 SDValue N00In = N00Elt.getOperand(0);
39054 SDValue N01In = N01Elt.getOperand(0);
39055 SDValue N10In = N10Elt.getOperand(0);
39056 SDValue N11In = N11Elt.getOperand(0);
39057 // First time we find an input, capture it.
39058 if (!In0) {
39059 In0 = N00In;
39060 In1 = N01In;
39061 } else {
39062 // Mul is commutative so the input vectors can be in any order.
39063 // Canonicalize to make the compares easier.
39064 if (In0 != N00In)
39065 std::swap(N00In, N01In);
39066 if (In0 != N10In)
39067 std::swap(N10In, N11In);
39068 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
39069 return SDValue();
39070 }
39071 }
39072 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39073 ArrayRef<SDValue> Ops) {
39074 // Shrink by adding truncate nodes and let DAGCombine fold with the
39075 // sources.
39076 EVT InVT = Ops[0].getValueType();
39077 assert(InVT.getScalarType() == MVT::i16 &&
39078 "Unexpected scalar element type");
39079 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
39080 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
39081 InVT.getVectorNumElements() / 2);
39082 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
39083 };
39084 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
39085 PMADDBuilder);
39086 }
39088 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
39089 const X86Subtarget &Subtarget) {
39090 const SDNodeFlags Flags = N->getFlags();
39091 if (Flags.hasVectorReduction()) {
39092 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
39093 return Sad;
39094 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
39095 return MAdd;
39096 }
39097 EVT VT = N->getValueType(0);
39098 SDValue Op0 = N->getOperand(0);
39099 SDValue Op1 = N->getOperand(1);
39101 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
39102 return MAdd;
39103 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
39104 return MAdd;
39106 // Try to synthesize horizontal adds from adds of shuffles.
39107 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
39108 VT == MVT::v8i32) &&
39109 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
39110 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39111 ArrayRef<SDValue> Ops) {
39112 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
39113 };
39114 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
39115 HADDBuilder);
39116 }
39118 if (SDValue V = combineIncDecVector(N, DAG))
39119 return V;
39121 return combineAddOrSubToADCOrSBB(N, DAG);
39124 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
39125 const X86Subtarget &Subtarget) {
39126 SDValue Op0 = N->getOperand(0);
39127 SDValue Op1 = N->getOperand(1);
39128 EVT VT = N->getValueType(0);
39130 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
39131 // is only worth it with SSSE3 (PSHUFB).
39132 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
39133 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
39134 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
39135 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
39136 VT == MVT::v16i32 || VT == MVT::v8i64)))
39137 return SDValue();
39139 SDValue SubusLHS, SubusRHS;
39140 // Try to find umax(a,b) - b or a - umin(a,b) patterns
39141 // they may be converted to subus(a,b).
39142 // TODO: Need to add IR canonicalization for this code.
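// e.g. (sub (umax %a, %b), %b) --> (subus %a, %b)
//      (sub %a, (umin %a, %b)) --> (subus %a, %b)
// Both identities hold because the max/min clamps the subtraction at zero.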
39143 if (Op0.getOpcode() == ISD::UMAX) {
39144 SubusRHS = Op1;
39145 SDValue MaxLHS = Op0.getOperand(0);
39146 SDValue MaxRHS = Op0.getOperand(1);
39147 if (MaxLHS == Op1)
39148 SubusLHS = MaxRHS;
39149 else if (MaxRHS == Op1)
39150 SubusLHS = MaxLHS;
39151 else
39152 return SDValue();
39153 } else if (Op1.getOpcode() == ISD::UMIN) {
39154 SubusLHS = Op0;
39155 SDValue MinLHS = Op1.getOperand(0);
39156 SDValue MinRHS = Op1.getOperand(1);
39157 if (MinLHS == Op0)
39158 SubusRHS = MinRHS;
39159 else if (MinRHS == Op0)
39160 SubusRHS = MinLHS;
39161 else
39162 return SDValue();
39163 } else
39164 return SDValue();
39166 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39167 ArrayRef<SDValue> Ops) {
39168 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
39169 };
39171 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
39172 // special preprocessing in some cases.
39173 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
39174 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
39175 { SubusLHS, SubusRHS }, SUBUSBuilder);
39177 // The special preprocessing can only be applied if the value was
39178 // zero-extended from 16 bits, so we require the first 16 bits to be zero
39179 // for 32-bit values, or the first 48 bits for 64-bit values.
39181 KnownBits Known;
39182 DAG.computeKnownBits(SubusLHS, Known);
39183 unsigned NumZeros = Known.countMinLeadingZeros();
39184 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
39185 return SDValue();
39187 EVT ExtType = SubusLHS.getValueType();
39188 EVT ShrinkedType;
39189 if (VT == MVT::v8i32 || VT == MVT::v8i64)
39190 ShrinkedType = MVT::v8i16;
39191 else
39192 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
39194 // If SubusLHS is zero-extended, truncate SubusRHS to the narrower type by
39195 // saturating it first: SubusRHS = umin(0xFFF..., SubusRHS).
39196 SDValue SaturationConst =
39197 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
39198 ShrinkedType.getScalarSizeInBits()),
39199 SDLoc(SubusLHS), ExtType);
39200 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
39201 SaturationConst);
39202 SDValue NewSubusLHS =
39203 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
39204 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
39205 SDValue Psubus =
39206 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
39207 { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
39208 // Zero extend the result; it may be used somewhere as 32 bit. If not, the
39209 // zext together with a following trunc will simply fold away.
39210 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
39213 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
39214 const X86Subtarget &Subtarget) {
39215 SDValue Op0 = N->getOperand(0);
39216 SDValue Op1 = N->getOperand(1);
39218 // X86 can't encode an immediate LHS of a sub. See if we can push the
39219 // negation into a preceding instruction.
39220 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
39221 // If the RHS of the sub is a XOR with one use and a constant, invert the
39222 // immediate. Then add one to the LHS of the sub so we can turn
39223 // X-Y -> X+~Y+1, saving one register.
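// e.g. (sub 5, (xor %x, 2)) --> (add (xor %x, -3), 6), since ~2 == -3 and
// X - Y == X + ~Y + 1.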
39224 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
39225 isa<ConstantSDNode>(Op1.getOperand(1))) {
39226 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
39227 EVT VT = Op0.getValueType();
39228 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
39229 Op1.getOperand(0),
39230 DAG.getConstant(~XorC, SDLoc(Op1), VT));
39231 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
39232 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
39236 // Try to synthesize horizontal subs from subs of shuffles.
39237 EVT VT = N->getValueType(0);
39238 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
39239 VT == MVT::v8i32) &&
39240 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
39241 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39242 ArrayRef<SDValue> Ops) {
39243 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
39244 };
39245 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
39246 HSUBBuilder);
39247 }
39249 if (SDValue V = combineIncDecVector(N, DAG))
39250 return V;
39252 // Try to create PSUBUS if SUB's argument is max/min
39253 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
39254 return V;
39256 return combineAddOrSubToADCOrSBB(N, DAG);
39259 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
39260 TargetLowering::DAGCombinerInfo &DCI,
39261 const X86Subtarget &Subtarget) {
39262 if (DCI.isBeforeLegalize())
39263 return SDValue();
39265 SDLoc DL(N);
39266 unsigned Opcode = N->getOpcode();
39267 MVT VT = N->getSimpleValueType(0);
39268 MVT SVT = VT.getVectorElementType();
39269 unsigned NumElts = VT.getVectorNumElements();
39270 unsigned EltSizeInBits = SVT.getSizeInBits();
39272 SDValue Op = N->getOperand(0);
39273 MVT OpVT = Op.getSimpleValueType();
39274 MVT OpEltVT = OpVT.getVectorElementType();
39275 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
39276 unsigned InputBits = OpEltSizeInBits * NumElts;
39278 // Perform any constant folding.
39279 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
39280 APInt UndefElts;
39281 SmallVector<APInt, 64> EltBits;
39282 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
39283 APInt Undefs(NumElts, 0);
39284 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
39285 bool IsZEXT =
39286 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
39287 for (unsigned i = 0; i != NumElts; ++i) {
39288 if (UndefElts[i]) {
39289 Undefs.setBit(i);
39290 continue;
39291 }
39292 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
39293 : EltBits[i].sextOrTrunc(EltSizeInBits);
39295 return getConstVector(Vals, Undefs, VT, DAG, DL);
39298 // (vzext (bitcast (vzext (x)) -> (vzext x)
39299 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
39300 SDValue V = peekThroughBitcasts(Op);
39301 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
39302 MVT InnerVT = V.getSimpleValueType();
39303 MVT InnerEltVT = InnerVT.getVectorElementType();
39305 // If the element sizes match exactly, we can just do one larger vzext. This
39306 // is always an exact type match as vzext operates on integer types.
39307 if (OpEltVT == InnerEltVT) {
39308 assert(OpVT == InnerVT && "Types must match for vzext!");
39309 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
39312 // The only other way we can combine them is if only a single element of the
39313 // inner vzext is used in the input to the outer vzext.
39314 if (InnerEltVT.getSizeInBits() < InputBits)
39315 return SDValue();
39317 // In this case, the inner vzext is completely dead because we're going to
39318 // only look at bits inside of the low element. Just do the outer vzext on
39319 // a bitcast of the input to the inner.
39320 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
39323 // Check if we can bypass extracting and re-inserting an element of an input
39324 // vector. Essentially:
39325 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
39326 // TODO: Add X86ISD::VSEXT support
39327 if (Opcode == X86ISD::VZEXT &&
39328 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39329 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39330 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
39331 SDValue ExtractedV = V.getOperand(0);
39332 SDValue OrigV = ExtractedV.getOperand(0);
39333 if (isNullConstant(ExtractedV.getOperand(1))) {
39334 MVT OrigVT = OrigV.getSimpleValueType();
39335 // Extract a subvector if necessary...
39336 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
39337 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
39338 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
39339 OrigVT.getVectorNumElements() / Ratio);
39340 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
39341 DAG.getIntPtrConstant(0, DL));
39343 Op = DAG.getBitcast(OpVT, OrigV);
39344 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
39345 }
39346 }
39348 return SDValue();
39349 }
39351 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
39352 const X86Subtarget &Subtarget) {
39353 MVT VT = N->getSimpleValueType(0);
39354 SDLoc DL(N);
39356 if (N->getOperand(0) == N->getOperand(1)) {
39357 if (N->getOpcode() == X86ISD::PCMPEQ)
39358 return getOnesVector(VT, DAG, DL);
39359 if (N->getOpcode() == X86ISD::PCMPGT)
39360 return getZeroVector(VT, Subtarget, DAG, DL);
39361 }
39363 return SDValue();
39364 }
39366 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
39367 TargetLowering::DAGCombinerInfo &DCI,
39368 const X86Subtarget &Subtarget) {
39369 if (DCI.isBeforeLegalizeOps())
39370 return SDValue();
39372 MVT OpVT = N->getSimpleValueType(0);
39373 SDLoc dl(N);
39374 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
39377 SDValue Vec = N->getOperand(0);
39378 SDValue SubVec = N->getOperand(1);
39380 unsigned IdxVal = N->getConstantOperandVal(2);
39381 MVT SubVecVT = SubVec.getSimpleValueType();
39383 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
39384 // Inserting zeros into zeros is a nop.
39385 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39386 return getZeroVector(OpVT, Subtarget, DAG, dl);
39388 // If we're inserting into a zero vector and then into a larger zero vector,
39389 // just insert into the larger zero vector directly.
39390 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39391 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
39392 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
39393 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39394 getZeroVector(OpVT, Subtarget, DAG, dl),
39395 SubVec.getOperand(1),
39396 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
39399 // If we're inserting into a zero vector and our input was extracted from an
39400 // insert into a zero vector of the same type and the extraction was at
39401 // least as large as the original insertion, just insert the original
39402 // subvector into a zero vector.
39403 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
39404 SubVec.getConstantOperandVal(1) == 0 &&
39405 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
39406 SDValue Ins = SubVec.getOperand(0);
39407 if (Ins.getConstantOperandVal(2) == 0 &&
39408 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
39409 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
39410 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39411 getZeroVector(OpVT, Subtarget, DAG, dl),
39412 Ins.getOperand(1), N->getOperand(2));
39415 // If we're inserting a bitcast into zeros, rewrite the insert and move the
39416 // bitcast to the other side. This helps with detecting zero extending
39417 // during isel.
39418 // TODO: Is this useful for other indices than 0?
39419 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
39420 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
39421 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
39422 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
39423 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
39424 DAG.getBitcast(NewVT, Vec),
39425 SubVec.getOperand(0), N->getOperand(2));
39426 return DAG.getBitcast(OpVT, Insert);
39430 // Stop here if this is an i1 vector.
39431 if (IsI1Vector)
39432 return SDValue();
39434 // If this is an insert of an extract, combine to a shuffle. Don't do this
39435 // if the insert or extract can be represented with a subregister operation.
39436 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39437 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
39438 (IdxVal != 0 || !Vec.isUndef())) {
39439 int ExtIdxVal = SubVec.getConstantOperandVal(1);
39440 if (ExtIdxVal != 0) {
39441 int VecNumElts = OpVT.getVectorNumElements();
39442 int SubVecNumElts = SubVecVT.getVectorNumElements();
39443 SmallVector<int, 64> Mask(VecNumElts);
39444 // First create an identity shuffle mask.
39445 for (int i = 0; i != VecNumElts; ++i)
39446 Mask[i] = i;
39447 // Now insert the extracted portion.
39448 for (int i = 0; i != SubVecNumElts; ++i)
39449 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
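// e.g. inserting a v4i32 subvector extracted from elements [4,8) of another
// v8i32 vector at index 0 yields the mask <12,13,14,15,4,5,6,7>, where
// indices >= 8 select from the extract's source operand.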
39451 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
39455 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
39456 // load:
39457 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39458 //                   (load16 addr + 16), Elts/2)
39459 // --> load32 addr
39461 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39462 //                   (load32 addr + 32), Elts/2)
39463 // --> load64 addr
39464 // or a 16-byte or 32-byte broadcast:
39465 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39466 // (load16 addr), Elts/2)
39467 // --> X86SubVBroadcast(load16 addr)
39469 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39470 // (load32 addr), Elts/2)
39471 // --> X86SubVBroadcast(load32 addr)
39472 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
39473 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39474 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
39475 if (isNullConstant(Vec.getOperand(2))) {
39476 SDValue SubVec2 = Vec.getOperand(1);
39477 // If needed, look through bitcasts to get to the load.
39478 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
39479 bool Fast;
39480 unsigned Alignment = FirstLd->getAlignment();
39481 unsigned AS = FirstLd->getAddressSpace();
39482 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
39483 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
39484 OpVT, AS, Alignment, &Fast) && Fast) {
39485 SDValue Ops[] = {SubVec2, SubVec};
39486 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
39487 false))
39488 return Ld;
39489 }
39490 }
39491 // If lower/upper loads are the same and the only users of the load, then
39492 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
39493 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
39494 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
39495 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
39496 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
39498 // If this is subv_broadcast insert into both halves, use a larger
39499 // subv_broadcast.
39500 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
39501 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
39502 SubVec.getOperand(0));
39504 // If we're inserting all zeros into the upper half, change this to
39505 // an insert into an all zeros vector. We will match this to a move
39506 // with implicit upper bit zeroing during isel.
39507 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39508 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39509 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
39510 Vec.getOperand(2));
39512 // If we are inserting into both halves of the vector, the starting
39513 // vector should be undef. If it isn't, make it so. Only do this if the
39514 // early insert has no other uses.
39515 // TODO: Should this be a generic DAG combine?
39516 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
39517 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
39518 SubVec2, Vec.getOperand(2));
39519 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
39520 N->getOperand(2));
39521 }
39522 }
39523 }
39526 return SDValue();
39527 }
39529 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
39530 TargetLowering::DAGCombinerInfo &DCI,
39531 const X86Subtarget &Subtarget) {
39532 if (DCI.isBeforeLegalizeOps())
39533 return SDValue();
39535 MVT OpVT = N->getSimpleValueType(0);
39536 SDValue InVec = N->getOperand(0);
39537 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
39539 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
39540 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
39542 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
39543 if (OpVT.getScalarType() == MVT::i1)
39544 return DAG.getConstant(1, SDLoc(N), OpVT);
39545 return getOnesVector(OpVT, DAG, SDLoc(N));
39548 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
39549 return DAG.getBuildVector(
39550 OpVT, SDLoc(N),
39551 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
39553 // If we're extracting the lowest subvector and we're the only user,
39554 // we may be able to perform this with a smaller vector width.
39555 if (IdxVal == 0 && InVec.hasOneUse()) {
39556 unsigned InOpcode = InVec.getOpcode();
39557 if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
39558 // v2f64 CVTDQ2PD(v4i32).
39559 if (InOpcode == ISD::SINT_TO_FP &&
39560 InVec.getOperand(0).getValueType() == MVT::v4i32) {
39561 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
39563 // v2f64 CVTPS2PD(v4f32).
39564 if (InOpcode == ISD::FP_EXTEND &&
39565 InVec.getOperand(0).getValueType() == MVT::v4f32) {
39566 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
39569 if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
39570 OpVT.is128BitVector() &&
39571 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
39572 unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
39573 : ISD::SIGN_EXTEND_VECTOR_INREG;
39574 return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
39575 }
39576 }
39578 return SDValue();
39579 }
39581 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
39582 EVT VT = N->getValueType(0);
39583 SDValue Src = N->getOperand(0);
39585 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
39586 // This occurs frequently in our masked scalar intrinsic code and our
39587 // floating point select lowering with AVX512.
39588 // TODO: SimplifyDemandedBits instead?
39589 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
39590 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39591 if (C->getAPIntValue().isOneValue())
39592 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
39593 Src.getOperand(0));
39595 return SDValue();
39596 }
39598 // Simplify PMULDQ and PMULUDQ operations.
39599 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
39600 TargetLowering::DAGCombinerInfo &DCI) {
39601 SDValue LHS = N->getOperand(0);
39602 SDValue RHS = N->getOperand(1);
39604 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39605 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
39606 !DCI.isBeforeLegalizeOps());
39607 APInt DemandedMask(APInt::getLowBitsSet(64, 32));
39609 // PMULDQ/PMULUDQ only uses the lower 32 bits from each vector element.
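// For example, an operand of the form (and %x, 0xFFFFFFFF) can be simplified
// to plain %x here, because the multiply never reads the high 32 bits.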
39610 KnownBits LHSKnown;
39611 if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
39612 DCI.CommitTargetLoweringOpt(TLO);
39613 return SDValue(N, 0);
39616 KnownBits RHSKnown;
39617 if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
39618 DCI.CommitTargetLoweringOpt(TLO);
39619 return SDValue(N, 0);
39620 }
39622 return SDValue();
39623 }
39625 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
39626 DAGCombinerInfo &DCI) const {
39627 SelectionDAG &DAG = DCI.DAG;
39628 switch (N->getOpcode()) {
39629 default: break;
39630 case ISD::SCALAR_TO_VECTOR:
39631 return combineScalarToVector(N, DAG);
39632 case ISD::EXTRACT_VECTOR_ELT:
39633 case X86ISD::PEXTRW:
39634 case X86ISD::PEXTRB:
39635 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
39636 case ISD::INSERT_SUBVECTOR:
39637 return combineInsertSubvector(N, DAG, DCI, Subtarget);
39638 case ISD::EXTRACT_SUBVECTOR:
39639 return combineExtractSubvector(N, DAG, DCI, Subtarget);
39640 case ISD::VSELECT:
39641 case ISD::SELECT:
39642 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
39643 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
39644 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
39645 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
39646 case ISD::SUB: return combineSub(N, DAG, Subtarget);
39647 case X86ISD::SBB: return combineSBB(N, DAG);
39648 case X86ISD::ADC: return combineADC(N, DAG, DCI);
39649 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
39650 case ISD::SHL:
39651 case ISD::SRA:
39652 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
39653 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
39654 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
39655 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
39656 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
39657 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
39658 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
39659 case ISD::STORE: return combineStore(N, DAG, Subtarget);
39660 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
39661 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
39662 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
39663 case ISD::FADD:
39664 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
39665 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
39666 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
39667 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
39668 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
39669 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
39670 case X86ISD::FXOR:
39671 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
39672 case X86ISD::FMIN:
39673 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
39674 case ISD::FMINNUM:
39675 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
39676 case X86ISD::BT: return combineBT(N, DAG, DCI);
39677 case ISD::ANY_EXTEND:
39678 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
39679 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
39680 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
39681 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
39682 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
39683 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
39684 case X86ISD::PACKSS:
39685 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
39686 case X86ISD::VSHLI:
39687 case X86ISD::VSRAI:
39688 case X86ISD::VSRLI:
39689 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
39690 case ISD::SIGN_EXTEND_VECTOR_INREG:
39691 case ISD::ZERO_EXTEND_VECTOR_INREG:
39692 case X86ISD::VSEXT:
39693 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
39694 case X86ISD::PINSRB:
39695 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
39696 case X86ISD::SHUFP: // Handle all target specific shuffles
39697 case X86ISD::INSERTPS:
39698 case X86ISD::EXTRQI:
39699 case X86ISD::INSERTQI:
39700 case X86ISD::PALIGNR:
39701 case X86ISD::VSHLDQ:
39702 case X86ISD::VSRLDQ:
39703 case X86ISD::BLENDI:
39704 case X86ISD::UNPCKH:
39705 case X86ISD::UNPCKL:
39706 case X86ISD::MOVHLPS:
39707 case X86ISD::MOVLHPS:
39708 case X86ISD::PSHUFB:
39709 case X86ISD::PSHUFD:
39710 case X86ISD::PSHUFHW:
39711 case X86ISD::PSHUFLW:
39712 case X86ISD::MOVSHDUP:
39713 case X86ISD::MOVSLDUP:
39714 case X86ISD::MOVDDUP:
39715 case X86ISD::MOVSS:
39716 case X86ISD::MOVSD:
39717 case X86ISD::VBROADCAST:
39718 case X86ISD::VPPERM:
39719 case X86ISD::VPERMI:
39720 case X86ISD::VPERMV:
39721 case X86ISD::VPERMV3:
39722 case X86ISD::VPERMIL2:
39723 case X86ISD::VPERMILPI:
39724 case X86ISD::VPERMILPV:
39725 case X86ISD::VPERM2X128:
39726 case X86ISD::SHUF128:
39727 case X86ISD::VZEXT_MOVL:
39728 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
39729 case X86ISD::FMADD_RND:
39730 case X86ISD::FMSUB:
39731 case X86ISD::FMSUB_RND:
39732 case X86ISD::FNMADD:
39733 case X86ISD::FNMADD_RND:
39734 case X86ISD::FNMSUB:
39735 case X86ISD::FNMSUB_RND:
39736 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
39737 case X86ISD::FMADDSUB_RND:
39738 case X86ISD::FMSUBADD_RND:
39739 case X86ISD::FMADDSUB:
39740 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
39741 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
39742 case X86ISD::MGATHER:
39743 case X86ISD::MSCATTER:
39744 case ISD::MGATHER:
39745 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
39746 case X86ISD::PCMPEQ:
39747 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
39748 case X86ISD::PMULDQ:
39749 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
39750 }
39752 return SDValue();
39753 }
39755 /// Return true if the target has native support for the specified value type
39756 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
39757 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
39758 /// some i16 instructions are slow.
39759 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
39760 if (!isTypeLegal(VT))
39761 return false;
39763 // There are no vXi8 shifts.
39764 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
39765 return false;
39767 if (VT != MVT::i16)
39768 return true;
39770 switch (Opc) {
39771 default:
39772 return true;
39773 case ISD::LOAD:
39774 case ISD::SIGN_EXTEND:
39775 case ISD::ZERO_EXTEND:
39776 case ISD::ANY_EXTEND:
39777 case ISD::SHL:
39778 case ISD::SRL:
39779 case ISD::SUB:
39780 case ISD::ADD:
39781 case ISD::MUL:
39782 case ISD::AND:
39783 case ISD::OR:
39784 case ISD::XOR:
39785 return false;
39786 }
39787 }
39789 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
39790 SDValue Value, SDValue Addr,
39791 SelectionDAG &DAG) const {
39792 const Module *M = DAG.getMachineFunction().getMMI().getModule();
39793 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
39794 if (IsCFProtectionSupported) {
39795 // In case control-flow branch protection is enabled, we need to add a
39796 // notrack prefix to the indirect branch. To do that we create the
39797 // NT_BRIND SDNode. Upon ISEL, the pattern will convert it to a jmp with
39798 // the NoTrack prefix.
39799 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
39802 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
39805 /// This method queries the target whether it is beneficial for the DAG
39806 /// combiner to promote the specified node. If true, it should return the
39807 /// desired promotion type by reference.
39808 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
39809 EVT VT = Op.getValueType();
39810 if (VT != MVT::i16)
39811 return false;
39813 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
39814 if (!Op.hasOneUse())
39815 return false;
39816 SDNode *User = *Op->use_begin();
39817 if (!ISD::isNormalStore(User))
39818 return false;
39819 auto *Ld = cast<LoadSDNode>(Load);
39820 auto *St = cast<StoreSDNode>(User);
39821 return Ld->getBasePtr() == St->getBasePtr();
39822 };
39824 bool Commute = false;
39825 switch (Op.getOpcode()) {
39826 default: return false;
39827 case ISD::SIGN_EXTEND:
39828 case ISD::ZERO_EXTEND:
39829 case ISD::ANY_EXTEND:
39830 break;
39831 case ISD::SHL:
39832 case ISD::SRL: {
39833 SDValue N0 = Op.getOperand(0);
39834 // Look out for (store (shl (load), x)).
39835 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
39836 return false;
39837 break;
39838 }
39839 case ISD::ADD:
39840 case ISD::MUL:
39841 case ISD::AND:
39842 case ISD::OR:
39843 case ISD::XOR:
39844 Commute = true;
39845 LLVM_FALLTHROUGH;
39846 case ISD::SUB: {
39847 SDValue N0 = Op.getOperand(0);
39848 SDValue N1 = Op.getOperand(1);
39849 // Avoid disabling potential load folding opportunities.
39850 if (MayFoldLoad(N1) &&
39851 (!Commute || !isa<ConstantSDNode>(N0) ||
39852 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
39854 if (MayFoldLoad(N0) &&
39855 ((Commute && !isa<ConstantSDNode>(N1)) ||
39856 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
39857 return false;
39858 break;
39859 }
39860 }
39862 PVT = MVT::i32;
39863 return true;
39864 }
39865 bool X86TargetLowering::
39866 isDesirableToCombineBuildVectorToShuffleTruncate(
39867 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
39869 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
39870 "Element count mismatch");
39872 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
39873 "Shuffle Mask expected to be legal");
39875 // For 32-bit elements VPERMD is better than shuffle+truncate.
39876 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
39877 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
39878 return false;
39880 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
39881 return false;
39883 return true;
39884 }
//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}
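// Illustrative usage (a sketch, not from the original source):
//   matchAsm("  bswap $0", {"bswap", "$0"});  // true
//   matchAsm("bswapl$0", {"bswapl", "$0"});   // false: pieces must be
//                                             // separated by whitespace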
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}
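// Illustrative usage (a sketch, not from the original source): for the
// constraint string "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}", the clobber
// pieces are {"~{cc}", "~{dirflag}", "~{fpsr}", "~{flags}"}; all four flag
// registers are covered, so the helper returns true and rewriting the asm
// to llvm.bswap is safe.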
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  ->  llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
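// Illustrative input (a sketch, not from the original source): C code such as
//   unsigned v;
//   asm("bswap $0" : "=r"(v) : "0"(v));
// reaches this hook as an inline-asm call and is rewritten into the
// llvm.bswap.i32 intrinsic, which the backend can then select directly.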
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
    case 'k': // AVX512 masking registers.
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'z':
      case '0':
        return C_Register;
      case 'i':
      case 'm':
      case 'k':
      case 't':
      case '2':
        return C_RegisterClass;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
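// Illustrative mapping (a sketch, not from the original source):
//   "a"  -> C_Register      (exactly [RE]AX)
//   "x"  -> C_RegisterClass (any XMM/YMM register)
//   "I"  -> C_Other         (an immediate in [0, 31])
//   "Yk" -> C_RegisterClass (AVX512 write-mask registers)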
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y': {
    unsigned Size = StringRef(constraint).size();
    // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
    char NextChar = Size == 2 ? constraint[1] : 'i';
    if (Size > 2)
      break;
    switch (NextChar) {
    default:
      weight = CW_Invalid;
      break;
    case 'z':
    case '0':
      if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
        return CW_SpecificReg;
      break;
    // Conditional OpMask regs (AVX512)
    case 'k':
      if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
        return CW_Register;
      break;
    // Any MMX reg.
    case 'm':
      if (type->isX86_MMXTy() && Subtarget.hasMMX())
        return weight;
      break;
    // Any SSE reg when ISA >= SSE2, same as 'Y'
    case 'i':
    case 't':
    case '2':
      if (!Subtarget.hasSSE2())
        return CW_Invalid;
      break;
    }
    // Fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  }
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
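// Illustrative outcome (a sketch, not from the original source): for
//   asm("" : : "I"(7));
// the operand is a ConstantInt in [0, 31], so 'I' yields CW_Constant, while
// the same operand matched against "r" would get a generic register weight
// from the default TargetLowering handling.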
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
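// Illustrative outcome (a sketch, not from the original source): for
//   asm("..." : "=X"(f));   // where f is a float
// the 'X' constraint is narrowed to "Y" when SSE2 is available, to "x" with
// only SSE1, and otherwise falls back to the default handling ('f').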
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
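// Illustrative outcome (a sketch, not from the original source): for
//   asm("shll %1, %0" : "+r"(x) : "I"(5));
// the operand 5 satisfies the 'I' range [0, 31], so a target constant is
// pushed into Ops; a value such as 99 would fail the check and the operand
// would be rejected.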
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'i':
    case 't':
    case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z':
    case '0':
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS is the dedicated register representing condition code.
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Make sure it isn't a register that requires 64-bit mode.
  if (!Subtarget.is64Bit() &&
      (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
      TRI->getEncodingValue(Res.first) >= 8) {
    // Register requires REX prefix, but we're in 32-bit mode.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }

  // Make sure it isn't a register that requires AVX512.
  if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
      TRI->getEncodingValue(Res.first) & 0x10) {
    // Register requires EVEX prefix.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
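// Illustrative outcome (a sketch, not from the original source): the
// constraint "{ax}" paired with an i32 operand is remapped to EAX in
// GR32RegClass, so
//   asm("..." : "+r"(x) : : ) // with x pinned via "{ax}"(v)
// ends up using %eax rather than the 16-bit %ax for a 32-bit value.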
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
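// Illustrative results (a sketch, not from the original source): a legal
// mode with AM.Scale == 0 (e.g. [reg]) costs 0, a legal scaled-index mode
// (e.g. [reg1 + reg2*4]) costs 1, and an illegal mode reports -1.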
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
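// Illustrative consequence (a sketch, not from the original source): in a
// function marked minsize,
//   %q = sdiv i32 %x, 7
// stays a real idiv (small encoding); without minsize it is rewritten into
// the usual multiply-by-magic-constant sequence, which is faster but larger.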
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
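// Illustrative selection (a sketch, not from the original source):
//   x86_64-pc-windows-msvc  -> "__chkstk"
//   x86_64-w64-windows-gnu  -> "___chkstk_ms"
//   i686-pc-windows-msvc    -> "_chkstk"
//   any target with the fn attribute "probe-stack"="my_probe"
//                           -> "my_probe" (hypothetical symbol name)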