1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
13 //===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/WinEHFuncInfo.h"
40 #include "llvm/IR/CallSite.h"
41 #include "llvm/IR/CallingConv.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/DiagnosticInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/GlobalAlias.h"
47 #include "llvm/IR/GlobalVariable.h"
48 #include "llvm/IR/Instructions.h"
49 #include "llvm/IR/Intrinsics.h"
50 #include "llvm/MC/MCAsmInfo.h"
51 #include "llvm/MC/MCContext.h"
52 #include "llvm/MC/MCExpr.h"
53 #include "llvm/MC/MCSymbol.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/ErrorHandling.h"
57 #include "llvm/Support/KnownBits.h"
58 #include "llvm/Support/MathExtras.h"
59 #include "llvm/Target/TargetOptions.h"
66 #define DEBUG_TYPE "x86-isel"
68 STATISTIC(NumTailCalls, "Number of tail calls");
70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
71 "x86-experimental-vector-widening-legalization", cl::init(false),
72 cl::desc("Enable an experimental vector type legalization through widening "
73 "rather than promotion."),
76 static cl::opt<int> ExperimentalPrefLoopAlignment(
77 "x86-experimental-pref-loop-alignment", cl::init(4),
78 cl::desc("Sets the preferable loop alignment for experiments "
79 "(the last x86-experimental-pref-loop-alignment bits"
80 " of the loop header PC will be 0)."),
83 static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
95 MachineFunction &MF = DAG.getMachineFunction();
96 DAG.getContext()->diagnose(
97 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101 const X86Subtarget &STI)
102 : TargetLowering(TM), Subtarget(STI) {
103 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104 X86ScalarSSEf64 = Subtarget.hasSSE2();
105 X86ScalarSSEf32 = Subtarget.hasSSE1();
106 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
108 // Set up the TargetLowering object.
110 // X86 is weird. It always uses i8 for shift amounts and setcc results.
111 setBooleanContents(ZeroOrOneBooleanContent);
112 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
115 // For 64-bit, since we have so many registers, use the ILP scheduler.
116 // For 32-bit, use the register pressure specific scheduling.
117 // For Atom, always use ILP scheduling.
118 if (Subtarget.isAtom())
119 setSchedulingPreference(Sched::ILP);
120 else if (Subtarget.is64Bit())
121 setSchedulingPreference(Sched::ILP);
123 setSchedulingPreference(Sched::RegPressure);
124 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
127 // Bypass expensive divides and use cheaper ones.
128 if (TM.getOptLevel() >= CodeGenOpt::Default) {
129 if (Subtarget.hasSlowDivide32())
130 addBypassSlowDiv(32, 8);
131 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132 addBypassSlowDiv(64, 32);
135 if (Subtarget.isTargetKnownWindowsMSVC() ||
136 Subtarget.isTargetWindowsItanium()) {
137 // Setup Windows compiler runtime calls.
138 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140 setLibcallName(RTLIB::SREM_I64, "_allrem");
141 setLibcallName(RTLIB::UREM_I64, "_aullrem");
142 setLibcallName(RTLIB::MUL_I64, "_allmul");
143 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
150 if (Subtarget.isTargetDarwin()) {
151 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152 setUseUnderscoreSetJmp(false);
153 setUseUnderscoreLongJmp(false);
154 } else if (Subtarget.isTargetWindowsGNU()) {
155 // MS runtime is weird: it exports _setjmp, but longjmp!
156 setUseUnderscoreSetJmp(true);
157 setUseUnderscoreLongJmp(false);
159 setUseUnderscoreSetJmp(true);
160 setUseUnderscoreLongJmp(true);
163 // Set up the register classes.
164 addRegisterClass(MVT::i8, &X86::GR8RegClass);
165 addRegisterClass(MVT::i16, &X86::GR16RegClass);
166 addRegisterClass(MVT::i32, &X86::GR32RegClass);
167 if (Subtarget.is64Bit())
168 addRegisterClass(MVT::i64, &X86::GR64RegClass);
170 for (MVT VT : MVT::integer_valuetypes())
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
173 // We don't accept any truncstore of integer registers.
174 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
181 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
183 // SETOEQ and SETUNE require checking two conditions.
184 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
192 if (Subtarget.hasCMov()) {
193 setOperationAction(ISD::ABS , MVT::i16 , Custom);
194 setOperationAction(ISD::ABS , MVT::i32 , Custom);
195 if (Subtarget.is64Bit())
196 setOperationAction(ISD::ABS , MVT::i64 , Custom);
199 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
201 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
202 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
203 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
205 if (Subtarget.is64Bit()) {
206 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
207 // f32/f64 are legal, f80 is custom.
208 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
210 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
211 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
212 } else if (!Subtarget.useSoftFloat()) {
213 // We have an algorithm for SSE2->double, and we turn this into a
214 // 64-bit FILD followed by conditional FADD for other targets.
215 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
216 // We have an algorithm for SSE2, and we turn this into a 64-bit
217 // FILD or VCVTUSI2SS/SD for other targets.
218 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
220 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
223 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
225 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
226 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
228 if (!Subtarget.useSoftFloat()) {
229 // SSE has no i16 to fp conversion, only i32.
230 if (X86ScalarSSEf32) {
231 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
232 // f32 and f64 cases are Legal, f80 case is not
233 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
235 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
236 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
239 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
240 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
243 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
245 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
246 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
248 if (!Subtarget.useSoftFloat()) {
249 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
250 // are Legal, f80 is custom lowered.
251 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
252 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
254 if (X86ScalarSSEf32) {
255 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
256 // f32 and f64 cases are Legal, f80 case is not
257 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
259 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
260 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
263 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
264 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
265 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
268 // Handle FP_TO_UINT by promoting the destination to a larger signed
270 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
271 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
272 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
274 if (Subtarget.is64Bit()) {
275 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
276 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
277 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
278 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
280 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
281 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
283 } else if (!Subtarget.useSoftFloat()) {
284 // Since AVX is a superset of SSE3, only check for SSE here.
285 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
286 // Expand FP_TO_UINT into a select.
287 // FIXME: We would like to use a Custom expander here eventually to do
288 // the optimal thing for SSE vs. the default expansion in the legalizer.
289 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
291 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
292 // With SSE3 we can use fisttpll to convert to a signed i64; without
293 // SSE, we're stuck with a fistpll.
294 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
296 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
299 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
300 if (!X86ScalarSSEf64) {
301 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
302 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
303 if (Subtarget.is64Bit()) {
304 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
305 // Without SSE, i64->f64 goes through memory.
306 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
308 } else if (!Subtarget.is64Bit())
309 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
311 // Scalar integer divide and remainder are lowered to use operations that
312 // produce two results, to match the available instructions. This exposes
313 // the two-result form to trivial CSE, which is able to combine x/y and x%y
314 // into a single instruction.
316 // Scalar integer multiply-high is also lowered to use two-result
317 // operations, to match the available instructions. However, plain multiply
318 // (low) operations are left as Legal, as there are single-result
319 // instructions for this in x86. Using the two-result multiply instructions
320 // when both high and low results are needed must be arranged by dagcombine.
321 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
322 setOperationAction(ISD::MULHS, VT, Expand);
323 setOperationAction(ISD::MULHU, VT, Expand);
324 setOperationAction(ISD::SDIV, VT, Expand);
325 setOperationAction(ISD::UDIV, VT, Expand);
326 setOperationAction(ISD::SREM, VT, Expand);
327 setOperationAction(ISD::UREM, VT, Expand);
330 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
331 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
332 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
333 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
334 setOperationAction(ISD::BR_CC, VT, Expand);
335 setOperationAction(ISD::SELECT_CC, VT, Expand);
337 if (Subtarget.is64Bit())
338 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
339 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
340 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
341 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
342 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
344 setOperationAction(ISD::FREM , MVT::f32 , Expand);
345 setOperationAction(ISD::FREM , MVT::f64 , Expand);
346 setOperationAction(ISD::FREM , MVT::f80 , Expand);
347 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
349 // Promote the i8 variants and force them on up to i32 which has a shorter
351 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
352 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
353 if (!Subtarget.hasBMI()) {
354 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
355 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
356 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
357 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
358 if (Subtarget.is64Bit()) {
359 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
360 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
364 if (Subtarget.hasLZCNT()) {
365 // When promoting the i8 variants, force them to i32 for a shorter
367 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
368 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
370 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
371 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
372 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
373 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
374 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
375 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
376 if (Subtarget.is64Bit()) {
377 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
378 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
382 // Special handling for half-precision floating point conversions.
383 // If we don't have F16C support, then lower half float conversions
384 // into library calls.
385 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
386 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
387 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
390 // There's never any support for operations beyond MVT::f32.
391 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
392 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
393 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
394 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
396 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
397 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
398 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
399 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
400 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
401 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
403 if (Subtarget.hasPOPCNT()) {
404 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
406 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
407 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
408 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
409 if (Subtarget.is64Bit())
410 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
413 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
415 if (!Subtarget.hasMOVBE())
416 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
418 // These should be promoted to a larger select which is supported.
419 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
420 // X86 wants to expand cmov itself.
421 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
422 setOperationAction(ISD::SELECT, VT, Custom);
423 setOperationAction(ISD::SETCC, VT, Custom);
425 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
426 if (VT == MVT::i64 && !Subtarget.is64Bit())
428 setOperationAction(ISD::SELECT, VT, Custom);
429 setOperationAction(ISD::SETCC, VT, Custom);
432 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
433 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
434 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
436 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
437 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
438 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
439 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
440 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
441 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
442 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
443 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
446 for (auto VT : { MVT::i32, MVT::i64 }) {
447 if (VT == MVT::i64 && !Subtarget.is64Bit())
449 setOperationAction(ISD::ConstantPool , VT, Custom);
450 setOperationAction(ISD::JumpTable , VT, Custom);
451 setOperationAction(ISD::GlobalAddress , VT, Custom);
452 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
453 setOperationAction(ISD::ExternalSymbol , VT, Custom);
454 setOperationAction(ISD::BlockAddress , VT, Custom);
457 // 64-bit shl, sra, srl (iff 32-bit x86)
458 for (auto VT : { MVT::i32, MVT::i64 }) {
459 if (VT == MVT::i64 && !Subtarget.is64Bit())
461 setOperationAction(ISD::SHL_PARTS, VT, Custom);
462 setOperationAction(ISD::SRA_PARTS, VT, Custom);
463 setOperationAction(ISD::SRL_PARTS, VT, Custom);
466 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
467 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
469 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
471 // Expand certain atomics
472 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
473 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
477 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
478 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
479 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
482 if (Subtarget.hasCmpxchg16b()) {
483 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
486 // FIXME - use subtarget debug flags
487 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
488 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
489 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
490 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
493 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
494 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
496 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
497 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
499 setOperationAction(ISD::TRAP, MVT::Other, Legal);
500 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
502 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
503 setOperationAction(ISD::VASTART , MVT::Other, Custom);
504 setOperationAction(ISD::VAEND , MVT::Other, Expand);
505 bool Is64Bit = Subtarget.is64Bit();
506 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
507 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
509 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
510 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
512 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
514 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
515 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
516 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
518 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
519 // f32 and f64 use SSE.
520 // Set up the FP register classes.
521 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
522 : &X86::FR32RegClass);
523 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
524 : &X86::FR64RegClass);
526 for (auto VT : { MVT::f32, MVT::f64 }) {
527 // Use ANDPD to simulate FABS.
528 setOperationAction(ISD::FABS, VT, Custom);
530 // Use XORP to simulate FNEG.
531 setOperationAction(ISD::FNEG, VT, Custom);
533 // Use ANDPD and ORPD to simulate FCOPYSIGN.
534 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
536 // We don't support sin/cos/fmod
537 setOperationAction(ISD::FSIN , VT, Expand);
538 setOperationAction(ISD::FCOS , VT, Expand);
539 setOperationAction(ISD::FSINCOS, VT, Expand);
542 // Lower this to MOVMSK plus an AND.
543 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
544 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
546 // Expand FP immediates into loads from the stack, except for the special
548 addLegalFPImmediate(APFloat(+0.0)); // xorpd
549 addLegalFPImmediate(APFloat(+0.0f)); // xorps
550 } else if (UseX87 && X86ScalarSSEf32) {
551 // Use SSE for f32, x87 for f64.
552 // Set up the FP register classes.
553 addRegisterClass(MVT::f32, &X86::FR32RegClass);
554 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
556 // Use ANDPS to simulate FABS.
557 setOperationAction(ISD::FABS , MVT::f32, Custom);
559 // Use XORP to simulate FNEG.
560 setOperationAction(ISD::FNEG , MVT::f32, Custom);
562 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
564 // Use ANDPS and ORPS to simulate FCOPYSIGN.
565 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
566 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
568 // We don't support sin/cos/fmod
569 setOperationAction(ISD::FSIN , MVT::f32, Expand);
570 setOperationAction(ISD::FCOS , MVT::f32, Expand);
571 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
573 // Special cases we handle for FP constants.
574 addLegalFPImmediate(APFloat(+0.0f)); // xorps
575 addLegalFPImmediate(APFloat(+0.0)); // FLD0
576 addLegalFPImmediate(APFloat(+1.0)); // FLD1
577 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
578 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
580 // Always expand sin/cos functions even though x87 has an instruction.
581 setOperationAction(ISD::FSIN , MVT::f64, Expand);
582 setOperationAction(ISD::FCOS , MVT::f64, Expand);
583 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
585 // f32 and f64 in x87.
586 // Set up the FP register classes.
587 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
588 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
590 for (auto VT : { MVT::f32, MVT::f64 }) {
591 setOperationAction(ISD::UNDEF, VT, Expand);
592 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
594 // Always expand sin/cos functions even though x87 has an instruction.
595 setOperationAction(ISD::FSIN , VT, Expand);
596 setOperationAction(ISD::FCOS , VT, Expand);
597 setOperationAction(ISD::FSINCOS, VT, Expand);
599 addLegalFPImmediate(APFloat(+0.0)); // FLD0
600 addLegalFPImmediate(APFloat(+1.0)); // FLD1
601 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
602 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
603 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
604 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
605 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
606 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
609 // We don't support FMA.
610 setOperationAction(ISD::FMA, MVT::f64, Expand);
611 setOperationAction(ISD::FMA, MVT::f32, Expand);
613 // Long double always uses X87, except f128 in MMX.
615 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
616 addRegisterClass(MVT::f128, &X86::VR128RegClass);
617 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
618 setOperationAction(ISD::FABS , MVT::f128, Custom);
619 setOperationAction(ISD::FNEG , MVT::f128, Custom);
620 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
623 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
624 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
625 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
627 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
628 addLegalFPImmediate(TmpFlt); // FLD0
630 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
633 APFloat TmpFlt2(+1.0);
634 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
636 addLegalFPImmediate(TmpFlt2); // FLD1
637 TmpFlt2.changeSign();
638 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
641 // Always expand sin/cos functions even though x87 has an instruction.
642 setOperationAction(ISD::FSIN , MVT::f80, Expand);
643 setOperationAction(ISD::FCOS , MVT::f80, Expand);
644 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
646 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
647 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
648 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
649 setOperationAction(ISD::FRINT, MVT::f80, Expand);
650 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
651 setOperationAction(ISD::FMA, MVT::f80, Expand);
654 // Always use a library call for pow.
655 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
656 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
657 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
659 setOperationAction(ISD::FLOG, MVT::f80, Expand);
660 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
661 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
662 setOperationAction(ISD::FEXP, MVT::f80, Expand);
663 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
664 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
665 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
667 // Some FP actions are always expanded for vector types.
668 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
669 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
670 setOperationAction(ISD::FSIN, VT, Expand);
671 setOperationAction(ISD::FSINCOS, VT, Expand);
672 setOperationAction(ISD::FCOS, VT, Expand);
673 setOperationAction(ISD::FREM, VT, Expand);
674 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
675 setOperationAction(ISD::FPOW, VT, Expand);
676 setOperationAction(ISD::FLOG, VT, Expand);
677 setOperationAction(ISD::FLOG2, VT, Expand);
678 setOperationAction(ISD::FLOG10, VT, Expand);
679 setOperationAction(ISD::FEXP, VT, Expand);
680 setOperationAction(ISD::FEXP2, VT, Expand);
683 // First set operation action for all vector types to either promote
684 // (for widening) or expand (for scalarization). Then we will selectively
685 // turn on ones that can be effectively codegen'd.
686 for (MVT VT : MVT::vector_valuetypes()) {
687 setOperationAction(ISD::SDIV, VT, Expand);
688 setOperationAction(ISD::UDIV, VT, Expand);
689 setOperationAction(ISD::SREM, VT, Expand);
690 setOperationAction(ISD::UREM, VT, Expand);
691 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
692 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
693 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
694 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
695 setOperationAction(ISD::FMA, VT, Expand);
696 setOperationAction(ISD::FFLOOR, VT, Expand);
697 setOperationAction(ISD::FCEIL, VT, Expand);
698 setOperationAction(ISD::FTRUNC, VT, Expand);
699 setOperationAction(ISD::FRINT, VT, Expand);
700 setOperationAction(ISD::FNEARBYINT, VT, Expand);
701 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
702 setOperationAction(ISD::MULHS, VT, Expand);
703 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
704 setOperationAction(ISD::MULHU, VT, Expand);
705 setOperationAction(ISD::SDIVREM, VT, Expand);
706 setOperationAction(ISD::UDIVREM, VT, Expand);
707 setOperationAction(ISD::CTPOP, VT, Expand);
708 setOperationAction(ISD::CTTZ, VT, Expand);
709 setOperationAction(ISD::CTLZ, VT, Expand);
710 setOperationAction(ISD::ROTL, VT, Expand);
711 setOperationAction(ISD::ROTR, VT, Expand);
712 setOperationAction(ISD::BSWAP, VT, Expand);
713 setOperationAction(ISD::SETCC, VT, Expand);
714 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
715 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
716 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
717 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
718 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
719 setOperationAction(ISD::TRUNCATE, VT, Expand);
720 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
721 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
722 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
723 setOperationAction(ISD::SELECT_CC, VT, Expand);
724 for (MVT InnerVT : MVT::vector_valuetypes()) {
725 setTruncStoreAction(InnerVT, VT, Expand);
727 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
728 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
730 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
731 // types, we have to deal with them whether we ask for Expansion or not.
732 // Setting Expand causes its own optimisation problems though, so leave
734 if (VT.getVectorElementType() == MVT::i1)
735 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
737 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
738 // split/scalarized right now.
739 if (VT.getVectorElementType() == MVT::f16)
740 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
744 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
745 // with -msoft-float, disable use of MMX as well.
746 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
747 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
748 // No operations on x86mmx supported, everything uses intrinsics.
751 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
752 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
753 : &X86::VR128RegClass);
755 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
756 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
757 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
758 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
759 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
760 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
761 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
762 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
763 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
766 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
767 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
768 : &X86::VR128RegClass);
770 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
771 // registers cannot be used even for integer operations.
772 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
773 : &X86::VR128RegClass);
774 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
775 : &X86::VR128RegClass);
776 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
777 : &X86::VR128RegClass);
778 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
779 : &X86::VR128RegClass);
781 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
782 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
783 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
784 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
785 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
786 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
787 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
788 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
789 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
790 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
791 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
792 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
793 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
795 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
796 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
797 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
798 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
799 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
802 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
803 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
804 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
806 // Provide custom widening for v2f32 setcc. This is really for VLX when
807 // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
808 // type legalization changing the result type to v4i1 during widening.
809 // It works fine for SSE2 and is probably faster so no need to qualify with
811 setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
813 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
814 setOperationAction(ISD::SETCC, VT, Custom);
815 setOperationAction(ISD::CTPOP, VT, Custom);
816 setOperationAction(ISD::CTTZ, VT, Custom);
818 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
819 // setcc all the way to isel and prefer SETGT in some isel patterns.
820 setCondCodeAction(ISD::SETLT, VT, Custom);
821 setCondCodeAction(ISD::SETLE, VT, Custom);
824 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
825 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
826 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
827 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
828 setOperationAction(ISD::VSELECT, VT, Custom);
829 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
832 // We support custom legalizing of sext and anyext loads for specific
833 // memory vector types which we can load as a scalar (or sequence of
834 // scalars) and extend in-register to a legal 128-bit vector type. For sext
835 // loads these must work with a single scalar load.
836 for (MVT VT : MVT::integer_vector_valuetypes()) {
837 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
838 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
839 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
840 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
841 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
842 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
843 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
844 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
845 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
848 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
849 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
850 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
851 setOperationAction(ISD::VSELECT, VT, Custom);
853 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
856 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
860 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
861 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
862 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
863 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
864 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
865 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
866 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
869 // Custom lower v2i64 and v2f64 selects.
870 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
871 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
873 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
874 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
876 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
877 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
879 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
881 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
882 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
884 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
885 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
887 for (MVT VT : MVT::fp_vector_valuetypes())
888 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
890 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
891 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
892 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
893 if (!Subtarget.hasAVX512())
894 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
896 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
897 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
898 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
900 // In the customized shift lowering, the legal v4i32/v2i64 cases
901 // in AVX2 will be recognized.
902 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
903 setOperationAction(ISD::SRL, VT, Custom);
904 setOperationAction(ISD::SHL, VT, Custom);
905 setOperationAction(ISD::SRA, VT, Custom);
908 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
909 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
910 setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
913 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
914 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
915 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
916 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
917 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
918 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
919 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
920 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
921 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
924 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
925 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
926 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
927 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
928 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
929 setOperationAction(ISD::FRINT, RoundedTy, Legal);
930 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
933 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
934 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
935 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
936 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
937 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
938 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
939 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
940 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
942 // FIXME: Do we need to handle scalar-to-vector here?
943 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
945 // We directly match byte blends in the backend as they match the VSELECT condition form.
947 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
949 // SSE41 brings specific instructions for doing vector sign extend even in
950 // cases where we don't have SRA.
951 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
952 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
953 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
956 for (MVT VT : MVT::integer_vector_valuetypes()) {
957 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
958 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
959 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
962 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
963 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
964 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
965 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
966 setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
967 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
968 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
969 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
970 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
973 // i8 vectors are custom because the source register and source
974 // memory operand types are not the same width.
975 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
978 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
979 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
980 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
981 setOperationAction(ISD::ROTL, VT, Custom);
983 // XOP can efficiently perform BITREVERSE with VPPERM.
984 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
985 setOperationAction(ISD::BITREVERSE, VT, Custom);
987 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
988 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
989 setOperationAction(ISD::BITREVERSE, VT, Custom);
992 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
993 bool HasInt256 = Subtarget.hasInt256();
995 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
996 : &X86::VR256RegClass);
997 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
998 : &X86::VR256RegClass);
999 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1000 : &X86::VR256RegClass);
1001 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1002 : &X86::VR256RegClass);
1003 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1004 : &X86::VR256RegClass);
1005 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1006 : &X86::VR256RegClass);
1008 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1009 setOperationAction(ISD::FFLOOR, VT, Legal);
1010 setOperationAction(ISD::FCEIL, VT, Legal);
1011 setOperationAction(ISD::FTRUNC, VT, Legal);
1012 setOperationAction(ISD::FRINT, VT, Legal);
1013 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1014 setOperationAction(ISD::FNEG, VT, Custom);
1015 setOperationAction(ISD::FABS, VT, Custom);
1016 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1019 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1020 // even though v8i16 is a legal type.
1021 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1022 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1023 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1025 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1026 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1028 if (!Subtarget.hasAVX512())
1029 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1031 for (MVT VT : MVT::fp_vector_valuetypes())
1032 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1034 // In the customized shift lowering, the legal v8i32/v4i64 cases
1035 // in AVX2 will be recognized.
1036 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1037 setOperationAction(ISD::SRL, VT, Custom);
1038 setOperationAction(ISD::SHL, VT, Custom);
1039 setOperationAction(ISD::SRA, VT, Custom);
1042 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1043 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1044 setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
1046 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1047 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1048 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1050 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1051 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1052 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1053 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1056 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1057 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1058 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1059 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1061 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1062 setOperationAction(ISD::SETCC, VT, Custom);
1063 setOperationAction(ISD::CTPOP, VT, Custom);
1064 setOperationAction(ISD::CTTZ, VT, Custom);
1065 setOperationAction(ISD::CTLZ, VT, Custom);
1067 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1068 // setcc all the way to isel and prefer SETGT in some isel patterns.
1069 setCondCodeAction(ISD::SETLT, VT, Custom);
1070 setCondCodeAction(ISD::SETLE, VT, Custom);
1073 if (Subtarget.hasAnyFMA()) {
1074 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1075 MVT::v2f64, MVT::v4f64 })
1076 setOperationAction(ISD::FMA, VT, Legal);
1079 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1080 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1081 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1084 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1085 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1086 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1087 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1089 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1090 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1092 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1093 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1094 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1095 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1097 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1098 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1099 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1100 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1102 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1103 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1104 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1105 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1106 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1107 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1111 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1112 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1113 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1115 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1116 // when we have a 256bit-wide blend with immediate.
1117 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1119 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1120 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1121 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1122 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1123 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1124 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1125 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1126 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1130 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1131 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1132 setOperationAction(ISD::MLOAD, VT, Legal);
1133 setOperationAction(ISD::MSTORE, VT, Legal);
1136 // Extract subvector is special because the value type
1137 // (result) is 128-bit but the source is 256-bit wide.
1138 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1139 MVT::v4f32, MVT::v2f64 }) {
1140 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1143 // Custom lower several nodes for 256-bit types.
1144 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1145 MVT::v8f32, MVT::v4f64 }) {
1146 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1147 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1148 setOperationAction(ISD::VSELECT, VT, Custom);
1149 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1150 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1151 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1152 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1153 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1157 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1159 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1160 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1161 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1162 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1163 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1164 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1165 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1169 // Custom legalize 2x32 to get a little better code.
1170 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1171 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1173 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1174 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1175 setOperationAction(ISD::MGATHER, VT, Custom);
1179 // This block controls legalization of the mask vector sizes that are
1180 // available with AVX512. 512-bit vectors are in a separate block controlled
1181 // by useAVX512Regs.
1182 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1183 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1184 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1185 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1186 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1187 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1189 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1190 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1191 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1193 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1194 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1195 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1196 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1197 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1198 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1200 // There is no byte sized k-register load or store without AVX512DQ.
1201 if (!Subtarget.hasDQI()) {
1202 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1203 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1204 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1205 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1207 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1208 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1209 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1210 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1213 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1214 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1215 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1216 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1217 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1220 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1221 setOperationAction(ISD::ADD, VT, Custom);
1222 setOperationAction(ISD::SUB, VT, Custom);
1223 setOperationAction(ISD::MUL, VT, Custom);
1224 setOperationAction(ISD::SETCC, VT, Custom);
1225 setOperationAction(ISD::SELECT, VT, Custom);
1226 setOperationAction(ISD::TRUNCATE, VT, Custom);
1228 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1229 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1230 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1231 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1232 setOperationAction(ISD::VSELECT, VT, Expand);
1235 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1236 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1237 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1238 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
1239 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1240 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1241 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1242 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1243 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1246 // This block controls legalization for 512-bit operations with 32/64 bit
1247 // elements. 512-bits can be disabled based on prefer-vector-width and
1248 // required-vector-width function attributes.
1249 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1250 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1251 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1252 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1253 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1255 for (MVT VT : MVT::fp_vector_valuetypes())
1256 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1258 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1259 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1260 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1261 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1262 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1263 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1266 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1267 setOperationAction(ISD::FNEG, VT, Custom);
1268 setOperationAction(ISD::FABS, VT, Custom);
1269 setOperationAction(ISD::FMA, VT, Legal);
1270 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1273 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1274 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1275 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1276 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1277 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1278 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1279 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1280 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1281 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1282 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1284 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1285 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1286 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1287 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1288 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1290 if (!Subtarget.hasVLX()) {
1291 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1292 // to 512-bit rather than use the AVX2 instructions so that we can use
1294 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1295 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1296 setOperationAction(ISD::MLOAD, VT, Custom);
1297 setOperationAction(ISD::MSTORE, VT, Custom);
1301 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1302 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1303 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1304 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1305 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1306 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1307 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1308 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1310 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1311 setOperationAction(ISD::FFLOOR, VT, Legal);
1312 setOperationAction(ISD::FCEIL, VT, Legal);
1313 setOperationAction(ISD::FTRUNC, VT, Legal);
1314 setOperationAction(ISD::FRINT, VT, Legal);
1315 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1318 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1319 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1321 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1323 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1325 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1326 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1327 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1330 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1331 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1333 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1334 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1336 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1337 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1338 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1340 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1341 setOperationAction(ISD::SMAX, VT, Legal);
1342 setOperationAction(ISD::UMAX, VT, Legal);
1343 setOperationAction(ISD::SMIN, VT, Legal);
1344 setOperationAction(ISD::UMIN, VT, Legal);
1345 setOperationAction(ISD::ABS, VT, Legal);
1346 setOperationAction(ISD::SRL, VT, Custom);
1347 setOperationAction(ISD::SHL, VT, Custom);
1348 setOperationAction(ISD::SRA, VT, Custom);
1349 setOperationAction(ISD::CTPOP, VT, Custom);
1350 setOperationAction(ISD::CTTZ, VT, Custom);
1351 setOperationAction(ISD::ROTL, VT, Custom);
1352 setOperationAction(ISD::ROTR, VT, Custom);
1353 setOperationAction(ISD::SETCC, VT, Custom);
1355 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1356 // setcc all the way to isel and prefer SETGT in some isel patterns.
1357 setCondCodeAction(ISD::SETLT, VT, Custom);
1358 setCondCodeAction(ISD::SETLE, VT, Custom);
1361 // Need to promote to 64-bit even though we have 32-bit masked instructions
1362 // because the IR optimizers rearrange bitcasts around logic ops leaving
1363 // too many variations to handle if we don't promote them.
1364 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1365 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1366 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1368 if (Subtarget.hasDQI()) {
1369 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1370 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1371 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1372 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1374 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1377 if (Subtarget.hasCDI()) {
1378 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1379 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1380 setOperationAction(ISD::CTLZ, VT, Legal);
1381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1383 } // Subtarget.hasCDI()
1385 if (Subtarget.hasVPOPCNTDQ()) {
1386 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1387 setOperationAction(ISD::CTPOP, VT, Legal);
1390 // Extract subvector is special because the value type
1391 // (result) is 256-bit but the source is 512-bit wide.
1392 // 128-bit was made Legal under AVX1.
1393 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1394 MVT::v8f32, MVT::v4f64 })
1395 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1397 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1399 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1401 setOperationAction(ISD::VSELECT, VT, Custom);
1402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1403 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1404 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1405 setOperationAction(ISD::MLOAD, VT, Legal);
1406 setOperationAction(ISD::MSTORE, VT, Legal);
1407 setOperationAction(ISD::MGATHER, VT, Custom);
1408 setOperationAction(ISD::MSCATTER, VT, Custom);
1410 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1412 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1415 // Need to custom split v32i16/v64i8 bitcasts.
1416 if (!Subtarget.hasBWI()) {
1417 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1418 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1422 // This block controls legalization for operations that don't have
1423 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for 128/256-bit types.
1425 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1426 // These operations are handled on non-VLX by artificially widening in isel patterns.
1428 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1430 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1431 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1432 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1433 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1434 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1436 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1437 setOperationAction(ISD::SMAX, VT, Legal);
1438 setOperationAction(ISD::UMAX, VT, Legal);
1439 setOperationAction(ISD::SMIN, VT, Legal);
1440 setOperationAction(ISD::UMIN, VT, Legal);
1441 setOperationAction(ISD::ABS, VT, Legal);
1444 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1445 setOperationAction(ISD::ROTL, VT, Custom);
1446 setOperationAction(ISD::ROTR, VT, Custom);
1449 // Custom legalize 2x32 to get a little better code.
1450 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1451 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1453 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1454 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1455 setOperationAction(ISD::MSCATTER, VT, Custom);
1457 if (Subtarget.hasDQI()) {
1458 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1459 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1461 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1462 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1464 setOperationAction(ISD::MUL, VT, Legal);
1468 if (Subtarget.hasCDI()) {
1469 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1470 setOperationAction(ISD::CTLZ, VT, Legal);
1471 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1473 } // Subtarget.hasCDI()
1475 if (Subtarget.hasVPOPCNTDQ()) {
1476 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1477 setOperationAction(ISD::CTPOP, VT, Legal);
1481 // This block controls legalization of v32i1/v64i1 which are available with
1482 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with useBWIRegs.
1484 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1485 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1486 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1488 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1489 setOperationAction(ISD::ADD, VT, Custom);
1490 setOperationAction(ISD::SUB, VT, Custom);
1491 setOperationAction(ISD::MUL, VT, Custom);
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1494 setOperationAction(ISD::TRUNCATE, VT, Custom);
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1497 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1500 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1503 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1505 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1507 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1508 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1510 // Extends from v32i1 masks to 256-bit vectors.
1511 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1512 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1513 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1516 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1517 // disabled based on prefer-vector-width and required-vector-width function
1519 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1520 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1523 // Extends from v64i1 masks to 512-bit vectors.
1524 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1525 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1528 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1529 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1530 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1531 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1532 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1533 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1534 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1535 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1536 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1537 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1538 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1539 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1540 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1541 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1542 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1543 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1544 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1545 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1548 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1549 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1550 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1552 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1554 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1556 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1557 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1558 setOperationAction(ISD::VSELECT, VT, Custom);
1559 setOperationAction(ISD::ABS, VT, Legal);
1560 setOperationAction(ISD::SRL, VT, Custom);
1561 setOperationAction(ISD::SHL, VT, Custom);
1562 setOperationAction(ISD::SRA, VT, Custom);
1563 setOperationAction(ISD::MLOAD, VT, Legal);
1564 setOperationAction(ISD::MSTORE, VT, Legal);
1565 setOperationAction(ISD::CTPOP, VT, Custom);
1566 setOperationAction(ISD::CTTZ, VT, Custom);
1567 setOperationAction(ISD::CTLZ, VT, Custom);
1568 setOperationAction(ISD::SMAX, VT, Legal);
1569 setOperationAction(ISD::UMAX, VT, Legal);
1570 setOperationAction(ISD::SMIN, VT, Legal);
1571 setOperationAction(ISD::UMIN, VT, Legal);
1572 setOperationAction(ISD::SETCC, VT, Custom);
1574 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1575 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1576 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1579 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1580 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1583 if (Subtarget.hasBITALG()) {
1584 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1585 setOperationAction(ISD::CTPOP, VT, Legal);
1589 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1590 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1591 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1592 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1595 // These operations are handled on non-VLX by artificially widening in
1597 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1599 if (Subtarget.hasBITALG()) {
1600 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1601 setOperationAction(ISD::CTPOP, VT, Legal);
1605 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1606 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1607 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1608 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1609 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1610 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1612 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1613 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1614 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1615 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1616 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1618 if (Subtarget.hasDQI()) {
1619 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1620 // v2f32 UINT_TO_FP is already custom under SSE2.
1621 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1622 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1623 "Unexpected operation action!");
1624 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1625 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1626 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1629 if (Subtarget.hasBWI()) {
1630 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1631 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1635 // We want to custom lower some of our intrinsics.
1636 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1637 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1638 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1639 if (!Subtarget.is64Bit()) {
1640 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1641 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1644 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1645 // handle type legalization for these operations here.
1647 // FIXME: We really should do custom legalization for addition and
1648 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1649 // than generic legalization for 64-bit multiplication-with-overflow, though.
1650 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1651 if (VT == MVT::i64 && !Subtarget.is64Bit())
1653 // Add/Sub/Mul with overflow operations are custom lowered.
1654 setOperationAction(ISD::SADDO, VT, Custom);
1655 setOperationAction(ISD::UADDO, VT, Custom);
1656 setOperationAction(ISD::SSUBO, VT, Custom);
1657 setOperationAction(ISD::USUBO, VT, Custom);
1658 setOperationAction(ISD::SMULO, VT, Custom);
1659 setOperationAction(ISD::UMULO, VT, Custom);
1661 // Support carry in as value rather than glue.
1662 setOperationAction(ISD::ADDCARRY, VT, Custom);
1663 setOperationAction(ISD::SUBCARRY, VT, Custom);
1664 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1667 if (!Subtarget.is64Bit()) {
1668 // These libcalls are not available in 32-bit.
1669 setLibcallName(RTLIB::SHL_I128, nullptr);
1670 setLibcallName(RTLIB::SRL_I128, nullptr);
1671 setLibcallName(RTLIB::SRA_I128, nullptr);
1672 setLibcallName(RTLIB::MUL_I128, nullptr);
1675 // Combine sin / cos into _sincos_stret if it is available.
1676 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1677 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1678 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1679 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1682 if (Subtarget.isTargetWin64()) {
1683 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1684 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1685 setOperationAction(ISD::SREM, MVT::i128, Custom);
1686 setOperationAction(ISD::UREM, MVT::i128, Custom);
1687 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1688 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1691 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1692 // is. We should promote the value to 64-bits to solve this.
1693 // This is what the CRT headers do - `fmodf` is an inline header
1694 // function casting to f64 and calling `fmod`.
1695 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1696 Subtarget.isTargetWindowsItanium()))
1697 for (ISD::NodeType Op :
1698 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1699 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1700 if (isOperationExpand(Op, MVT::f32))
1701 setOperationAction(Op, MVT::f32, Promote);
1703 // We have target-specific dag combine patterns for the following nodes:
1704 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1705 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1706 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1707 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1708 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1709 setTargetDAGCombine(ISD::BITCAST);
1710 setTargetDAGCombine(ISD::VSELECT);
1711 setTargetDAGCombine(ISD::SELECT);
1712 setTargetDAGCombine(ISD::SHL);
1713 setTargetDAGCombine(ISD::SRA);
1714 setTargetDAGCombine(ISD::SRL);
1715 setTargetDAGCombine(ISD::OR);
1716 setTargetDAGCombine(ISD::AND);
1717 setTargetDAGCombine(ISD::ADD);
1718 setTargetDAGCombine(ISD::FADD);
1719 setTargetDAGCombine(ISD::FSUB);
1720 setTargetDAGCombine(ISD::FNEG);
1721 setTargetDAGCombine(ISD::FMA);
1722 setTargetDAGCombine(ISD::FMINNUM);
1723 setTargetDAGCombine(ISD::FMAXNUM);
1724 setTargetDAGCombine(ISD::SUB);
1725 setTargetDAGCombine(ISD::LOAD);
1726 setTargetDAGCombine(ISD::MLOAD);
1727 setTargetDAGCombine(ISD::STORE);
1728 setTargetDAGCombine(ISD::MSTORE);
1729 setTargetDAGCombine(ISD::TRUNCATE);
1730 setTargetDAGCombine(ISD::ZERO_EXTEND);
1731 setTargetDAGCombine(ISD::ANY_EXTEND);
1732 setTargetDAGCombine(ISD::SIGN_EXTEND);
1733 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1734 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1735 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1736 setTargetDAGCombine(ISD::SINT_TO_FP);
1737 setTargetDAGCombine(ISD::UINT_TO_FP);
1738 setTargetDAGCombine(ISD::SETCC);
1739 setTargetDAGCombine(ISD::MUL);
1740 setTargetDAGCombine(ISD::XOR);
1741 setTargetDAGCombine(ISD::MSCATTER);
1742 setTargetDAGCombine(ISD::MGATHER);
1744 computeRegisterProperties(Subtarget.getRegisterInfo());
1746 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1747 MaxStoresPerMemsetOptSize = 8;
1748 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1749 MaxStoresPerMemcpyOptSize = 4;
1750 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1751 MaxStoresPerMemmoveOptSize = 4;
1753 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1754 // that needs to benchmarked and balanced with the potential use of vector
1755 // load/store types (PR33329, PR33914).
1756 MaxLoadsPerMemcmp = 2;
1757 MaxLoadsPerMemcmpOptSize = 2;
1759 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1760 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1762 // An out-of-order CPU can speculatively execute past a predictable branch,
1763 // but a conditional move could be stalled by an expensive earlier operation.
1764 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1765 EnableExtLdPromotion = true;
1766 setPrefFunctionAlignment(4); // 2^4 bytes.
1768 verifyIntrinsicTables();
1771 // This has so far only been implemented for 64-bit MachO.
// Returns true when the stack guard should be fetched via the
// LOAD_STACK_GUARD pseudo rather than an ordinary global-variable load.
1772 bool X86TargetLowering::useLoadStackGuardNode() const {
1773 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
// Whether the loaded stack-guard value must be XORed with the frame pointer
// before being compared (see emitStackGuardXorFP below).
1776 bool X86TargetLowering::useStackGuardXorFP() const {
1777 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1778 return Subtarget.getTargetTriple().isOSMSVCRT();
// Emit the target node that XORs the stack-guard value \p Val with the frame
// pointer, selecting the 64- or 32-bit pseudo based on the subtarget.
1781 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1782 const SDLoc &DL) const {
1783 EVT PtrTy = getPointerTy(DAG.getDataLayout());
// XOR64_FP / XOR32_FP are pseudo machine opcodes; selection happens here
// directly via getMachineNode rather than through ISel patterns.
1784 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1785 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1786 return SDValue(Node, 0);
// Chooses how the type legalizer should handle an illegal vector type.
1789 TargetLoweringBase::LegalizeTypeAction
1790 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
// v32i1 is only natively available with AVX512BW; without it, split the
// vector instead of the default action.
1791 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1792 return TypeSplitVector;
// Under the experimental widening mode, widen everything except single
// element vectors and vXi1 mask vectors.
1794 if (ExperimentalVectorWideningLegalization &&
1795 VT.getVectorNumElements() != 1 &&
1796 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1797 return TypeWidenVector;
1799 return TargetLoweringBase::getPreferredVectorAction(VT);
// Register type used to pass/return values for a calling convention.
// v32i1 without BWI gets a special-case type here (the exact type returned
// on that path is on an elided line — confirm against the full source);
// everything else defers to the generic implementation.
1802 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1805 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1807 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
// Number of registers needed to pass/return a value for a calling
// convention. Mirrors the v32i1-without-BWI special case in
// getRegisterTypeForCallingConv above (special-case count is on an elided
// line — confirm against the full source).
1810 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1813 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1815 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
// Pick the result type for a SETCC on \p VT. With AVX512, vector compares
// can produce vXi1 mask types; otherwise fall back to an integer vector of
// the same shape.
1818 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1819 LLVMContext& Context,
1824 if (Subtarget.hasAVX512()) {
1825 const unsigned NumElts = VT.getVectorNumElements();
1827 // Figure out what this type will be legalized to.
1829 while (getTypeAction(Context, LegalVT) != TypeLegal)
1830 LegalVT = getTypeToTransformTo(Context, LegalVT);
1832 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1833 if (LegalVT.getSimpleVT().is512BitVector())
1834 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1836 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1837 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1838 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1840 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1841 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1842 return EVT::getVectorVT(Context, MVT::i1, NumElts);
// Default: same-width integer vector (e.g. vXi32 for vXf32 compares).
1846 return VT.changeVectorElementTypeToInteger();
1849 /// Helper for getByValTypeAlignment to determine
1850 /// the desired ByVal argument alignment.
1851 /// Recurses through arrays and structs, raising \p MaxAlign when a 128-bit
1851 /// (SSE) vector is found anywhere inside \p Ty.
1851 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1854 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
// 128-bit vectors want 16-byte alignment (action is on an elided line).
1855 if (VTy->getBitWidth() == 128)
1857 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
// Arrays inherit the alignment requirement of their element type.
1858 unsigned EltAlign = 0;
1859 getMaxByValAlign(ATy->getElementType(), EltAlign);
1860 if (EltAlign > MaxAlign)
1861 MaxAlign = EltAlign;
1862 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Structs take the max alignment over all their members.
1863 for (auto *EltTy : STy->elements()) {
1864 unsigned EltAlign = 0;
1865 getMaxByValAlign(EltTy, EltAlign);
1866 if (EltAlign > MaxAlign)
1867 MaxAlign = EltAlign;
1874 /// Return the desired alignment for ByVal aggregate
1875 /// function arguments in the caller parameter area. For X86, aggregates
1876 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1877 /// are at 4-byte boundaries.
1878 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1879 const DataLayout &DL) const {
1880 if (Subtarget.is64Bit()) {
1881 // Max of 8 and alignment of type.
1882 unsigned TyAlign = DL.getABITypeAlignment(Ty);
// 32-bit path: scan the type for SSE vectors to possibly raise the
// alignment (only meaningful when SSE1 is available).
1889 if (Subtarget.hasSSE1())
1890 getMaxByValAlign(Ty, Align);
1894 /// Returns the target specific optimal type for load
1895 /// and store operations as a result of memset, memcpy, and memmove
1896 /// lowering. If DstAlign is zero that means it's safe to destination
1897 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1898 /// means there isn't a need to check it against alignment requirement,
1899 /// probably because the source does not need to be loaded. If 'IsMemset' is
1900 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1901 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1902 /// source is constant so it does not need to be loaded.
1903 /// It returns EVT::Other if the type should be determined using generic
1904 /// target-independent logic.
1906 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1907 unsigned DstAlign, unsigned SrcAlign,
1908 bool IsMemset, bool ZeroMemset,
1910 MachineFunction &MF) const {
1911 const Function &F = MF.getFunction();
// Vector/FP types may only be used when the function does not forbid
// implicit floating-point usage.
1912 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1914 (!Subtarget.isUnalignedMem16Slow() ||
1915 ((DstAlign == 0 || DstAlign >= 16) &&
1916 (SrcAlign == 0 || SrcAlign >= 16)))) {
1917 // FIXME: Check if unaligned 32-byte accesses are slow.
1918 if (Size >= 32 && Subtarget.hasAVX()) {
1919 // Although this isn't a well-supported type for AVX1, we'll let
1920 // legalization and shuffle lowering produce the optimal codegen. If we
1921 // choose an optimal type with a vector element larger than a byte,
1922 // getMemsetStores() may create an intermediate splat (using an integer
1923 // multiply) before we splat as a vector.
1926 if (Subtarget.hasSSE2())
1928 // TODO: Can SSE1 handle a byte vector?
1929 if (Subtarget.hasSSE1())
1931 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1932 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1933 // Do not use f64 to lower memcpy if source is string constant. It's
1934 // better to use i32 to avoid the loads.
1935 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1936 // The gymnastics of splatting a byte value into an XMM register and then
1937 // only using 8-byte stores (because this is a CPU with slow unaligned
1938 // 16-byte accesses) makes that a loser.
1942 // This is a compromise. If we reach here, unaligned accesses may be slow on
1943 // this target. However, creating smaller, aligned accesses could be even
1944 // slower and would certainly be a lot more code.
1945 if (Subtarget.is64Bit() && Size >= 8)
// Whether \p VT is safe to use for a mem-op expansion: scalar FP types are
// only safe when the corresponding scalar-SSE level is enabled (otherwise
// x87 loads could raise spurious exceptions on arbitrary bit patterns).
1950 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1952 return X86ScalarSSEf32;
1953 else if (VT == MVT::f64)
1954 return X86ScalarSSEf64;
// Reports whether misaligned accesses of \p VT are allowed and, via *Fast,
// whether they are fast on this subtarget. Speed is keyed off the access
// size: 16-byte and 32-byte accesses consult the corresponding
// slow-unaligned subtarget flags.
1959 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1964 switch (VT.getSizeInBits()) {
1966 // 8-byte and under are always assumed to be fast.
1970 *Fast = !Subtarget.isUnalignedMem16Slow();
1973 *Fast = !Subtarget.isUnalignedMem32Slow();
1975 // TODO: What about AVX-512 (512-bit) accesses?
1978 // Misaligned accesses of any size are always allowed.
1982 /// Return the entry encoding for a jump table in the
1983 /// current function. The returned value is a member of the
1984 /// MachineJumpTableInfo::JTEntryKind enum.
1985 unsigned X86TargetLowering::getJumpTableEncoding() const {
1986 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
// symbol; the custom entry itself is produced by LowerCustomJumpTableEntry.
1988 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1989 return MachineJumpTableInfo::EK_Custom32;
1991 // Otherwise, use the normal jump table encoding heuristics.
1992 return TargetLowering::getJumpTableEncoding();
// Trivial forwarder: soft-float mode is a subtarget property.
1995 bool X86TargetLowering::useSoftFloat() const {
1996 return Subtarget.useSoftFloat();
// Marks the leading integer/pointer arguments of a libcall as in-register
// when the module requests register parameters (regparm) on X86-32.
1999 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2000 ArgListTy &Args) const {
2002 // Only relabel X86-32 for C / Stdcall CCs.
2003 if (Subtarget.is64Bit())
2005 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
// Budget of parameter registers comes from the module-level regparm value.
2007 unsigned ParamRegs = 0;
2008 if (auto *M = MF->getFunction().getParent())
2009 ParamRegs = M->getNumberRegisterParameters();
2011 // Mark the first N int arguments as having reg
2012 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2013 Type *T = Args[Idx].Ty;
2014 if (T->isIntOrPtrTy())
2015 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2016 unsigned numRegs = 1;
// NOTE(review): values wider than 4 bytes appear to consume a second
// register here (assignment is on an elided line) — confirm.
2017 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2019 if (ParamRegs < numRegs)
2021 ParamRegs -= numRegs;
2022 Args[Idx].IsInReg = true;
// Produces the custom (EK_Custom32) jump-table entry selected by
// getJumpTableEncoding: a @GOTOFF reference to the target block's symbol.
2028 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2029 const MachineBasicBlock *MBB,
2030 unsigned uid,MCContext &Ctx) const{
// Only reachable in GOT-style PIC mode (matches the encoding choice).
2031 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2032 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2034 return MCSymbolRefExpr::create(MBB->getSymbol(),
2035 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2038 /// Returns relocation base for the given PIC jumptable.
2038 /// On 32-bit targets this is the global base register node; the 64-bit
2038 /// path is handled on lines elided from this view.
2039 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2040 SelectionDAG &DAG) const {
2041 if (!Subtarget.is64Bit())
2042 // This doesn't have SDLoc associated with it, but is not really the
2043 // same as a Register.
2044 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2045 getPointerTy(DAG.getDataLayout()));
2049 /// This returns the relocation base for the given PIC jumptable,
2050 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2051 const MCExpr *X86TargetLowering::
2052 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2053 MCContext &Ctx) const {
2054 // X86-64 uses RIP relative addressing based on the jump table label.
2055 if (Subtarget.isPICStyleRIPRel())
2056 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2058 // Otherwise, the reference is relative to the PIC base.
2059 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
// Picks a representative register class (and a register-pressure cost) for
// \p VT, used by register-pressure heuristics. Unknown types defer to the
// generic implementation.
2062 std::pair<const TargetRegisterClass *, uint8_t>
2063 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2065 const TargetRegisterClass *RRC = nullptr;
2067 switch (VT.SimpleTy) {
2069 return TargetLowering::findRepresentativeClass(TRI, VT);
// Scalar integers map to a GPR class sized for the target.
2070 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2071 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2074 RRC = &X86::VR64RegClass;
// All scalar FP and 128/256/512-bit vector types share one representative
// vector register class.
2076 case MVT::f32: case MVT::f64:
2077 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2078 case MVT::v4f32: case MVT::v2f64:
2079 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2080 case MVT::v8f32: case MVT::v4f64:
2081 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2082 case MVT::v16f32: case MVT::v8f64:
2083 RRC = &X86::VR128XRegClass;
2086 return std::make_pair(RRC, Cost);
// Address space used for TLS-based stack-guard / safe-stack slots:
// 256 = %gs (kernel code model), 257 = %fs on x86-64.
2089 unsigned X86TargetLowering::getAddressSpace() const {
2090 if (Subtarget.is64Bit())
2091 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
// True when the target's C library reserves a TLS slot for the stack guard
// (glibc, Fuchsia, and Android API level >= 17).
2095 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2096 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2097 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
// Builds a constant i8* in the given (segment) address space pointing at
// byte \p Offset — i.e. an %fs:/%gs:-relative address.
2100 static Constant* SegmentOffset(IRBuilder<> &IRB,
2101 unsigned Offset, unsigned AddressSpace) {
2102 return ConstantExpr::getIntToPtr(
2103 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2104 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
// Returns the IR value of the stack guard: a segment-relative TLS slot on
// targets that provide one, otherwise the generic global variable.
2107 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2108 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2109 // tcbhead_t; use it instead of the usual global variable (see
2110 // sysdeps/{i386,x86_64}/nptl/tls.h)
2111 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2112 if (Subtarget.isTargetFuchsia()) {
2113 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2114 return SegmentOffset(IRB, 0x10, getAddressSpace());
2116 // %fs:0x28, unless we're using a Kernel code model, in which case
2117 // it's %gs:0x28. gs:0x14 on i386.
2118 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2119 return SegmentOffset(IRB, Offset, getAddressSpace());
2123 return TargetLowering::getIRStackGuard(IRB);
// Declares the module-level symbols the stack-protector lowering expects:
// MSVC CRT's cookie + checker on Windows, nothing on TLS-slot targets,
// otherwise the generic declarations.
2126 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2127 // MSVC CRT provides functionalities for stack protection.
2128 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2129 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2130 // MSVC CRT has a global variable holding security cookie.
2131 M.getOrInsertGlobal("__security_cookie",
2132 Type::getInt8PtrTy(M.getContext()));
2134 // MSVC CRT has a function to validate security cookie.
2135 auto *SecurityCheckCookie = cast<Function>(
2136 M.getOrInsertFunction("__security_check_cookie",
2137 Type::getVoidTy(M.getContext()),
2138 Type::getInt8PtrTy(M.getContext())));
// The CRT checker takes its argument in a register (fastcall, inreg).
2139 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2140 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2143 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2144 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2146 TargetLowering::insertSSPDeclarations(M);
// SelectionDAG stack-guard value: MSVC's __security_cookie on Windows,
// otherwise the generic lookup.
2149 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2150 // MSVC CRT has a global variable holding security cookie.
2151 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2152 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2153 return M.getGlobalVariable("__security_cookie");
2155 return TargetLowering::getSDagStackGuard(M);
// Guard-check function: MSVC's __security_check_cookie on Windows
// (declared in insertSSPDeclarations), otherwise the generic lookup.
2158 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2159 // MSVC CRT has a function to validate security cookie.
2160 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2161 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2162 return M.getFunction("__security_check_cookie");
2164 return TargetLowering::getSSPStackGuardCheck(M);
// Location of the SafeStack unsafe-stack pointer: a fixed TLS slot on
// Android and Fuchsia, a non-TLS default on Contiki, generic otherwise.
2167 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2168 if (Subtarget.getTargetTriple().isOSContiki())
2169 return getDefaultSafeStackPointerLocation(IRB, false);
2171 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2172 // definition of TLS_SLOT_SAFESTACK in
2173 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2174 if (Subtarget.isTargetAndroid()) {
2175 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2177 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2178 return SegmentOffset(IRB, Offset, getAddressSpace());
2181 // Fuchsia is similar.
2182 if (Subtarget.isTargetFuchsia()) {
2183 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2184 return SegmentOffset(IRB, 0x18, getAddressSpace());
2187 return TargetLowering::getSafeStackPointerLocation(IRB);
// Address-space casts are no-ops only between "normal" address spaces;
// 256/257 are the %gs/%fs segment spaces (see getAddressSpace) and are not
// freely interchangeable with flat pointers.
2190 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2191 unsigned DestAS) const {
2192 assert(SrcAS != DestAS && "Expected different address spaces!");
2194 return SrcAS < 256 && DestAS < 256;
2197 //===----------------------------------------------------------------------===//
2198 // Return Value Calling Convention Implementation
2199 //===----------------------------------------------------------------------===//
2201 #include "X86GenCallingConv.inc"
// Checks whether the given return values can be lowered for this calling
// convention by running them through the generated RetCC_X86 assigner.
2203 bool X86TargetLowering::CanLowerReturn(
2204 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2205 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2206 SmallVector<CCValAssign, 16> RVLocs;
2207 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2208 return CCInfo.CheckReturn(Outs, RetCC_X86);
// Scratch registers safe to clobber around calls; the list is
// zero-terminated. R11 is used regardless of calling convention.
2211 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2212 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2216 /// Lowers masks values (v*i1) to the local register values
2217 /// \returns DAG node after lowering to register type
2217 /// \param ValArg the mask value to lower
2217 /// \param ValLoc the scalar register type to lower into
2218 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2219 const SDLoc &Dl, SelectionDAG &DAG) {
2220 EVT ValVT = ValArg.getValueType();
// v1i1 is just its single element.
2222 if (ValVT == MVT::v1i1)
2223 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2224 DAG.getIntPtrConstant(0, Dl));
2226 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2227 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2228 // Two stage lowering might be required
2229 // bitcast: v8i1 -> i8 / v16i1 -> i16
2230 // anyextend: i8 -> i32 / i16 -> i32
2231 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2232 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2233 if (ValLoc == MVT::i32)
2234 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2238 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2239 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2240 // One stage lowering is required
2241 // bitcast: v32i1 -> i32 / v64i1 -> i64
2242 return DAG.getBitcast(ValLoc, ValArg);
// Fallback: any-extend the value into the requested location type.
2245 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2248 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2248 /// (32-bit AVX512BW only: the i64-typed mask must be split across the two
2248 /// consecutive register locations VA / NextVA).
2249 static void Passv64i1ArgInRegs(
2250 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2251 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2252 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2253 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2254 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2255 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2256 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2257 "The value should reside in two registers");
2259 // Before splitting the value we cast it to i64
2260 Arg = DAG.getBitcast(MVT::i64, Arg);
2262 // Splitting the value into two i32 types
2264 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2265 DAG.getConstant(0, Dl, MVT::i32));
2266 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2267 DAG.getConstant(1, Dl, MVT::i32));
2269 // Attach the two i32 types into corresponding registers
2270 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2271 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2275 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2277 const SmallVectorImpl<ISD::OutputArg> &Outs,
2278 const SmallVectorImpl<SDValue> &OutVals,
2279 const SDLoc &dl, SelectionDAG &DAG) const {
2280 MachineFunction &MF = DAG.getMachineFunction();
2281 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2283 // In some cases we need to disable registers from the default CSR list.
2284 // For example, when they are used for argument passing.
2285 bool ShouldDisableCalleeSavedRegister =
2286 CallConv == CallingConv::X86_RegCall ||
2287 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2289 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2290 report_fatal_error("X86 interrupts may not return any value");
2292 SmallVector<CCValAssign, 16> RVLocs;
2293 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2294 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2297 SmallVector<SDValue, 6> RetOps;
2298 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2299 // Operand #1 = Bytes To Pop
2300 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2303 // Copy the result values into the output registers.
2304 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2306 CCValAssign &VA = RVLocs[I];
2307 assert(VA.isRegLoc() && "Can only return in registers!");
2309 // Add the register to the CalleeSaveDisableRegs list.
2310 if (ShouldDisableCalleeSavedRegister)
2311 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2313 SDValue ValToCopy = OutVals[OutsIndex];
2314 EVT ValVT = ValToCopy.getValueType();
2316 // Promote values to the appropriate types.
2317 if (VA.getLocInfo() == CCValAssign::SExt)
2318 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2319 else if (VA.getLocInfo() == CCValAssign::ZExt)
2320 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2321 else if (VA.getLocInfo() == CCValAssign::AExt) {
2322 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2323 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2325 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2327 else if (VA.getLocInfo() == CCValAssign::BCvt)
2328 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2330 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2331 "Unexpected FP-extend for return value.");
2333 // If this is x86-64, and we disabled SSE, we can't return FP values,
2334 // or SSE or MMX vectors.
2335 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2336 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2337 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2338 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2339 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2340 } else if (ValVT == MVT::f64 &&
2341 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2342 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2343 // llvm-gcc has never done it right and no one has noticed, so this
2344 // should be OK for now.
2345 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2346 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2349 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2350 // the RET instruction and handled by the FP Stackifier.
2351 if (VA.getLocReg() == X86::FP0 ||
2352 VA.getLocReg() == X86::FP1) {
2353 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2354 // change the value to the FP stack register class.
2355 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2356 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2357 RetOps.push_back(ValToCopy);
2358 // Don't emit a copytoreg.
2362 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2363 // which is returned in RAX / RDX.
2364 if (Subtarget.is64Bit()) {
2365 if (ValVT == MVT::x86mmx) {
2366 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2367 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2368 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2370 // If we don't have SSE2 available, convert to v4f32 so the generated
2371 // register is legal.
2372 if (!Subtarget.hasSSE2())
2373 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2378 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2380 if (VA.needsCustom()) {
2381 assert(VA.getValVT() == MVT::v64i1 &&
2382 "Currently the only custom case is when we split v64i1 to 2 regs");
2384 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2387 assert(2 == RegsToPass.size() &&
2388 "Expecting two registers after Pass64BitArgInRegs");
2390 // Add the second register to the CalleeSaveDisableRegs list.
2391 if (ShouldDisableCalleeSavedRegister)
2392 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2394 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2397 // Add nodes to the DAG and add the values into the RetOps list
2398 for (auto &Reg : RegsToPass) {
2399 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2400 Flag = Chain.getValue(1);
2401 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2405 // Swift calling convention does not require we copy the sret argument
2406 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2408 // All x86 ABIs require that for returning structs by value we copy
2409 // the sret argument into %rax/%eax (depending on ABI) for the return.
2410 // We saved the argument into a virtual register in the entry block,
2411 // so now we copy the value out and into %rax/%eax.
2413 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2414 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2415 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2416 // either case FuncInfo->setSRetReturnReg() will have been called.
2417 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2418 // When we have both sret and another return value, we should use the
2419 // original Chain stored in RetOps[0], instead of the current Chain updated
2420 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2422 // For the case of sret and another return value, we have
2423 // Chain_0 at the function entry
2424 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2425 // If we use Chain_1 in getCopyFromReg, we will have
2426 // Val = getCopyFromReg(Chain_1)
2427 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2429 // getCopyToReg(Chain_0) will be glued together with
2430 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2431 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2432 // Data dependency from Unit B to Unit A due to usage of Val in
2433 // getCopyToReg(Chain_1, Val)
2434 // Chain dependency from Unit A to Unit B
2436 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2437 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2438 getPointerTy(MF.getDataLayout()));
2441 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2442 X86::RAX : X86::EAX;
2443 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2444 Flag = Chain.getValue(1);
2446 // RAX/EAX now acts like a return value.
2448 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2450 // Add the returned register to the CalleeSaveDisableRegs list.
2451 if (ShouldDisableCalleeSavedRegister)
2452 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2455 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2456 const MCPhysReg *I =
2457 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2460 if (X86::GR64RegClass.contains(*I))
2461 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2463 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2467 RetOps[0] = Chain; // Update chain.
2469 // Add the flag if we have it.
2471 RetOps.push_back(Flag);
2473 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2474 if (CallConv == CallingConv::X86_INTR)
2475 opcode = X86ISD::IRET;
2476 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
// Returns true if the only user of node N is a return (possibly reached
// through a CopyToReg or an FP_EXTEND), so the value is a candidate for being
// folded into the return sequence; Chain is the candidate tail-call chain.
// NOTE(review): several lines of this function (the early `return false;`
// bodies, the loop header's continuation, and the function tail that sets
// Chain/HasRet) appear to be elided in this excerpt — confirm against the
// full file before editing.
2479 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
// N must produce exactly one value, and that value must have exactly one use.
2480 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2483 SDValue TCChain = Chain;
2484 SDNode *Copy = *N->use_begin();
2485 if (Copy->getOpcode() == ISD::CopyToReg) {
2486 // If the copy has a glue operand, we conservatively assume it isn't safe to
2487 // perform a tail call.
2488 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
// Use the chain feeding the CopyToReg as the tentative tail-call chain.
2490 TCChain = Copy->getOperand(0);
// Only CopyToReg and FP_EXTEND users are acceptable intermediaries.
2491 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2494 bool HasRet = false;
// Every user of the copy must be an X86 return node for this to be safe.
2495 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2497 if (UI->getOpcode() != X86ISD::RET_FLAG)
2499 // If we are returning more than one value, we can definitely
2500 // not make a tail call see PR19530
2501 if (UI->getNumOperands() > 4)
// Four operands are tolerated only when the last one is glue.
2503 if (UI->getNumOperands() == 4 &&
2504 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
// Returns the type a return value of type VT should be extended to.
// Defaults to extending small integers to i32, except where the ABI does not
// require it (see below). ExtendKind is unused in the visible body.
// NOTE(review): closing braces/lines between 2522-2531 are elided in this
// excerpt.
2516 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2517 ISD::NodeType ExtendKind) const {
2518 MVT ReturnMVT = MVT::i32;
2520 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2521 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2522 // The ABI does not require i1, i8 or i16 to be extended.
2524 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2525 // always extending i8/i16 return values, so keep doing that for now.
2527 ReturnMVT = MVT::i8;
// Use the register type for the chosen minimum; never shrink VT itself.
2530 EVT MinVT = getRegisterType(Context, ReturnMVT);
2531 return VT.bitsLT(MinVT) ? MinVT : VT;
2534 /// Reads two 32 bit registers and creates a 64 bit mask value.
2535 /// \param VA The current 32 bit value that need to be assigned.
2536 /// \param NextVA The next 32 bit value that need to be assigned.
2537 /// \param Root The parent DAG node.
2538 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2539 ///                        glue purposes. In the case the DAG is already using
2540 ///                        physical register instead of virtual, we should glue
2541 ///                        our new SDValue to InFlag SDvalue.
2542 /// \return a new SDvalue of size 64bit.
// Used on 32-bit AVX512BW targets where a v64i1 argument is split across two
// 32-bit GPRs (VA = low half, NextVA = high half).
// NOTE(review): the declarations of `Reg`, `Lo`, and `Hi` locals (and a few
// other lines) are elided in this excerpt.
2543 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2544 SDValue &Root, SelectionDAG &DAG,
2545 const SDLoc &Dl, const X86Subtarget &Subtarget,
2546 SDValue *InFlag = nullptr) {
2547 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2548 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2549 assert(VA.getValVT() == MVT::v64i1 &&
2550 "Expecting first location of 64 bit width type");
2551 assert(NextVA.getValVT() == VA.getValVT() &&
2552 "The locations should have the same type");
2553 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2554 "The values should reside in two registers");
2558 SDValue ArgValueLo, ArgValueHi;
2560 MachineFunction &MF = DAG.getMachineFunction();
2561 const TargetRegisterClass *RC = &X86::GR32RegClass;
2563 // Read a 32 bit value from the registers.
2564 if (nullptr == InFlag) {
2565 // When no physical register is present,
2566 // create an intermediate virtual register.
2567 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2568 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2569 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2570 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2572 // When a physical register is available read the value from it and glue
2573 // the reads together.
2575 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
// Propagate the glue result (value index 2 of CopyFromReg) to the caller.
2576 *InFlag = ArgValueLo.getValue(2);
2578 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2579 *InFlag = ArgValueHi.getValue(2);
2582 // Convert the i32 type into v32i1 type.
2583 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2585 // Convert the i32 type into v32i1 type.
2586 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2588 // Concatenate the two values together.
2589 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2592 /// The function will lower a register of various sizes (8/16/32/64)
2593 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2594 /// \returns a DAG node contains the operand after lowering to mask type.
// ValArg holds the value in its promoted integer location type ValLoc; the
// result has mask vector type ValVT.
// NOTE(review): the switch cases mapping ValVT to `maskLen` (between lines
// 2609 and 2620) and several braces are elided in this excerpt.
2595 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2596 const EVT &ValLoc, const SDLoc &Dl,
2597 SelectionDAG &DAG) {
2598 SDValue ValReturned = ValArg;
// v1i1 is built directly from the scalar, no truncate/bitcast dance needed.
2600 if (ValVT == MVT::v1i1)
2601 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2603 if (ValVT == MVT::v64i1) {
2604 // In 32 bit machine, this case is handled by getv64i1Argument
2605 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2606 // In 64 bit machine, There is no need to truncate the value only bitcast
2609 switch (ValVT.getSimpleVT().SimpleTy) {
2620 llvm_unreachable("Expecting a vector of i1 types");
// Truncate the promoted integer down to the mask's bit width, then bitcast.
2623 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2625 return DAG.getBitcast(ValVT, ValReturned);
2628 /// Lower the result values of a call into the
2629 /// appropriate copies out of appropriate physical registers.
// For each CCValAssign produced by RetCC_X86, copies the value out of its
// physical register into InVals. RegMask (may be non-null) is pruned so that
// result registers are not treated as clobbered by the call.
// NOTE(review): loop-increment lines, some closing braces, and a few
// statements (e.g. the `SDValue Val` declaration and the RegMask null check)
// are elided in this excerpt.
2631 SDValue X86TargetLowering::LowerCallResult(
2632 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2633 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2634 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2635 uint32_t *RegMask) const {
2637 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2638 // Assign locations to each value returned by this call.
2639 SmallVector<CCValAssign, 16> RVLocs;
2640 bool Is64Bit = Subtarget.is64Bit();
2641 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2643 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2645 // Copy all of the result registers out of their specified physreg.
2646 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2648 CCValAssign &VA = RVLocs[I];
2649 EVT CopyVT = VA.getLocVT();
2651 // In some calling conventions we need to remove the used registers
2652 // from the register mask.
// Clear the result register and all of its sub-registers from the mask
// (each bit of the 32-bit mask words covers one physical register).
2654 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2655 SubRegs.isValid(); ++SubRegs)
2656 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2659 // If this is x86-64, and we disabled SSE, we can't return FP values
2660 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2661 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2662 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2663 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2666 // If we prefer to use the value in xmm registers, copy it out as f80 and
2667 // use a truncate to move it from fp stack reg to xmm reg.
2668 bool RoundAfterCopy = false;
2669 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2670 isScalarFPTypeInSSEReg(VA.getValVT())) {
2671 if (!Subtarget.hasX87())
2672 report_fatal_error("X87 register return with X87 disabled");
2674 RoundAfterCopy = (CopyVT != VA.getLocVT());
// v64i1 results split into two GPRs are reassembled by getv64i1Argument.
2678 if (VA.needsCustom()) {
2679 assert(VA.getValVT() == MVT::v64i1 &&
2680 "Currently the only custom case is when we split v64i1 to 2 regs");
2682 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2684 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2686 Val = Chain.getValue(0);
2687 InFlag = Chain.getValue(2);
// Round the f80 copy back down to the value type (see RoundAfterCopy above).
2691 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2692 // This truncation won't change the value.
2693 DAG.getIntPtrConstant(1, dl));
2695 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2696 if (VA.getValVT().isVector() &&
2697 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2698 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2699 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2700 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2702 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2705 InVals.push_back(Val);
2711 //===----------------------------------------------------------------------===//
2712 // C & StdCall & Fast Calling Convention implementation
2713 //===----------------------------------------------------------------------===//
2714 // StdCall calling convention seems to be standard for many Windows' API
2715 // routines and around. It differs from C calling convention just a little:
2716 // callee should clean up the stack, not the caller. Symbols should also be
2717 // decorated in some fancy way :) It doesn't support any vector arguments.
2718 // For info on fast calling convention see Fast Calling Convention (tail call)
2719 // implementation LowerX86_32FastCCCallTo.
2721 /// CallIsStructReturn - Determines whether a call uses struct return
// NOTE(review): the enumerator list (NotStructReturn / RegStructReturn /
// StackStructReturn, all used by the helpers below) is elided in this
// excerpt, along with the closing brace.
2723 enum StructReturnType {
// Classifies an outgoing call's struct-return convention based on the first
// outgoing argument's flags: no sret, sret passed in a register (inreg or
// MCU target), or sret passed on the stack.
// NOTE(review): the guard before line 2731 (presumably `if (Outs.empty())`)
// is elided in this excerpt.
2728 static StructReturnType
2729 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2731 return NotStructReturn;
2733 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2734 if (!Flags.isSRet())
2735 return NotStructReturn;
2736 if (Flags.isInReg() || IsMCU)
2737 return RegStructReturn;
2738 return StackStructReturn;
2741 /// Determines whether a function uses struct return semantics.
// Mirror of callIsStructReturn above, but classifies the function's own
// incoming arguments rather than an outgoing call's.
// NOTE(review): the guard before line 2745 (presumably `if (Ins.empty())`)
// is elided in this excerpt.
2742 static StructReturnType
2743 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2745 return NotStructReturn;
2747 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2748 if (!Flags.isSRet())
2749 return NotStructReturn;
2750 if (Flags.isInReg() || IsMCU)
2751 return RegStructReturn;
2752 return StackStructReturn;
2755 /// Make a copy of an aggregate at address specified by "Src" to address
2756 /// "Dst" with size and alignment information specified by the specific
2757 /// parameter attribute. The copy will be passed as a byval function parameter.
// Returns the chain of the emitted memcpy node. The size comes from the
// byval flags; distinct MachinePointerInfo()s mean no alias info is attached.
2758 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2759 SDValue Chain, ISD::ArgFlagsTy Flags,
2760 SelectionDAG &DAG, const SDLoc &dl) {
2761 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2763 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2764 /*isVolatile*/false, /*AlwaysInline=*/true,
2765 /*isTailCall*/false,
2766 MachinePointerInfo(), MachinePointerInfo());
2769 /// Return true if the calling convention is one that we can guarantee TCO for.
// These conventions (fastcc, GHC, regcall, HiPE, HHVM) are the ones for
// which shouldGuaranteeTCO/mayTailCallThisCC below allow guaranteed TCO.
2770 static bool canGuaranteeTCO(CallingConv::ID CC) {
2771 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2772 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2773 CC == CallingConv::HHVM);
2776 /// Return true if we might ever do TCO for calls with this calling convention.
// The listed C and callee-pop conventions answer true; everything else falls
// through to canGuaranteeTCO.
// NOTE(review): the `switch (CC) {` header, the `return true;` for the listed
// cases, and the `default:` label are elided in this excerpt.
2777 static bool mayTailCallThisCC(CallingConv::ID CC) {
2779 // C calling conventions:
2780 case CallingConv::C:
2781 case CallingConv::Win64:
2782 case CallingConv::X86_64_SysV:
2783 // Callee pop conventions:
2784 case CallingConv::X86_ThisCall:
2785 case CallingConv::X86_StdCall:
2786 case CallingConv::X86_VectorCall:
2787 case CallingConv::X86_FastCall:
2790 return canGuaranteeTCO(CC);
2794 /// Return true if the function is being made into a tailcall target by
2795 /// changing its ABI.
// Guaranteed TCO requires both the -tailcallopt option and a TCO-capable CC.
2796 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2797 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
// IR-level query: can this call instruction possibly be emitted as a tail
// call? Rejects calls not marked `tail`, functions with
// "disable-tail-calls"="true", and conventions mayTailCallThisCC rejects.
// NOTE(review): the declaration initialized at line 2802 (presumably
// `Attribute Attr =`), the early `return false;` bodies, and the final
// return are elided in this excerpt.
2800 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2802 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2803 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2806 ImmutableCallSite CS(CI);
2807 CallingConv::ID CalleeCC = CS.getCallingConv();
2808 if (!mayTailCallThisCC(CalleeCC))
// Lowers the i-th incoming argument when it lives in memory: creates the
// fixed stack object for the parameter slot and returns either the frame
// index (byval) or a load from it, handling X86 interrupt offsets, copy
// elision, and in-memory-extended i1 mask values along the way.
// NOTE(review): the return type line, the `EVT ValVT;` / `int Offset = 0;` /
// `SDValue PartAddr;` declarations, several `else` lines, assignments whose
// left-hand side starts on an elided line (e.g. before 2888/2906/2909), and
// various closing braces are elided in this excerpt.
2815 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2816 const SmallVectorImpl<ISD::InputArg> &Ins,
2817 const SDLoc &dl, SelectionDAG &DAG,
2818 const CCValAssign &VA,
2819 MachineFrameInfo &MFI, unsigned i) const {
2820 // Create the nodes corresponding to a load from this parameter slot.
2821 ISD::ArgFlagsTy Flags = Ins[i].Flags;
// Under guaranteed TCO, argument slots may be overwritten by a tail call's
// outgoing arguments, so they cannot be marked immutable.
2822 bool AlwaysUseMutable = shouldGuaranteeTCO(
2823 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2824 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2826 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2828 // If value is passed by pointer we have address passed instead of the value
2829 // itself. No need to extend if the mask value and location share the same
2831 bool ExtendedInMem =
2832 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2833 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2835 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2836 ValVT = VA.getLocVT();
2838 ValVT = VA.getValVT();
2840 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2841 // taken by a return address.
2843 if (CallConv == CallingConv::X86_INTR) {
2844 // X86 interrupts may take one or two arguments.
2845 // On the stack there will be no return address as in regular call.
2846 // Offset of last argument need to be set to -4/-8 bytes.
2847 // Where offset of the first argument out of two, should be set to 0 bytes.
2848 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2849 if (Subtarget.is64Bit() && Ins.size() == 2) {
2850 // The stack pointer needs to be realigned for 64 bit handlers with error
2851 // code, so the argument offset changes by 8 bytes.
2856 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2857 // changed with more analysis.
2858 // In case of tail call optimization mark all arguments mutable. Since they
2859 // could be overwritten by lowering of arguments in case of a tail call.
2860 if (Flags.isByVal()) {
2861 unsigned Bytes = Flags.getByValSize();
2862 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2864 // FIXME: For now, all byval parameter objects are marked as aliasing. This
2865 // can be improved with deeper analysis.
2866 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
2867 /*isAliased=*/true);
2868 // Adjust SP offset of interrupt parameter.
2869 if (CallConv == CallingConv::X86_INTR) {
2870 MFI.setObjectOffset(FI, Offset);
// byval arguments are used by address, so return the frame index directly.
2872 return DAG.getFrameIndex(FI, PtrVT);
2875 // This is an argument in memory. We might be able to perform copy elision.
2876 if (Flags.isCopyElisionCandidate()) {
2877 EVT ArgVT = Ins[i].ArgVT;
2879 if (Ins[i].PartOffset == 0) {
2880 // If this is a one-part value or the first part of a multi-part value,
2881 // create a stack object for the entire argument value type and return a
2882 // load from our portion of it. This assumes that if the first part of an
2883 // argument is in memory, the rest will also be in memory.
2884 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2885 /*Immutable=*/false);
2886 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2888 ValVT, dl, Chain, PartAddr,
2889 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2891 // This is not the first piece of an argument in memory. See if there is
2892 // already a fixed stack object including this offset. If so, assume it
2893 // was created by the PartOffset == 0 branch above and create a load from
2894 // the appropriate offset into it.
2895 int64_t PartBegin = VA.getLocMemOffset();
2896 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2897 int FI = MFI.getObjectIndexBegin();
// Scan existing fixed objects for one that fully contains this part.
2898 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2899 int64_t ObjBegin = MFI.getObjectOffset(FI);
2900 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2901 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2904 if (MFI.isFixedObjectIndex(FI)) {
2906 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2907 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2909 ValVT, dl, Chain, Addr,
2910 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2911 Ins[i].PartOffset));
// Ordinary in-memory argument: fixed object sized to the value type.
2916 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2917 VA.getLocMemOffset(), isImmutable);
2919 // Set SExt or ZExt flag.
2920 if (VA.getLocInfo() == CCValAssign::ZExt) {
2921 MFI.setObjectZExt(FI, true);
2922 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2923 MFI.setObjectSExt(FI, true);
2926 // Adjust SP offset of interrupt parameter.
2927 if (CallConv == CallingConv::X86_INTR) {
2928 MFI.setObjectOffset(FI, Offset);
2931 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2932 SDValue Val = DAG.getLoad(
2933 ValVT, dl, Chain, FIN,
2934 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
// i1-mask values extended in memory are re-formed into the value type:
// scalar-to-vector for vectors, truncate for scalars.
2935 return ExtendedInMem
2936 ? (VA.getValVT().isVector()
2937 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2938 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2942 // FIXME: Get this from tablegen.
// Returns the ordered list of GPRs used to pass 64-bit integer arguments:
// the 4-register Win64 set, or the 6-register SysV set otherwise.
2943 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2944 const X86Subtarget &Subtarget) {
2945 assert(Subtarget.is64Bit());
2947 if (Subtarget.isCallingConvWin64(CallConv)) {
2948 static const MCPhysReg GPR64ArgRegsWin64[] = {
2949 X86::RCX, X86::RDX, X86::R8, X86::R9
2951 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2954 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2955 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2957 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2960 // FIXME: Get this from tablegen.
// Returns the XMM registers that may carry varargs on 64-bit targets, or an
// empty list when XMM registers need not / cannot be saved (Win64, soft
// float, noimplicitfloat, or no SSE1).
// NOTE(review): the `return None;` (or equivalent empty-list return)
// statements after lines 2969 and 2979, and some braces, are elided in this
// excerpt.
2961 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2962 CallingConv::ID CallConv,
2963 const X86Subtarget &Subtarget) {
2964 assert(Subtarget.is64Bit());
2965 if (Subtarget.isCallingConvWin64(CallConv)) {
2966 // The XMM registers which might contain var arg parameters are shadowed
2967 // in their paired GPR. So we only need to save the GPR to their home
2969 // TODO: __vectorcall will change this.
2973 const Function &F = MF.getFunction();
2974 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2975 bool isSoftFloat = Subtarget.useSoftFloat();
2976 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2977 "SSE register cannot be used when SSE is disabled!");
2978 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2979 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2983 static const MCPhysReg XMMArgRegs64Bit[] = {
2984 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2985 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2987 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
// Returns true if ArgLocs is sorted by ascending value number — a sanity
// check used before argument-lowering loops that assume this ordering.
// NOTE(review): the closing `});` / `}` lines are elided in this excerpt.
2991 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2992 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2993 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2994 return A.getValNo() < B.getValNo();
2999 SDValue X86TargetLowering::LowerFormalArguments(
3000 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3001 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3002 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3003 MachineFunction &MF = DAG.getMachineFunction();
3004 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3005 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3007 const Function &F = MF.getFunction();
3008 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3009 F.getName() == "main")
3010 FuncInfo->setForceFramePointer(true);
3012 MachineFrameInfo &MFI = MF.getFrameInfo();
3013 bool Is64Bit = Subtarget.is64Bit();
3014 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3017 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3018 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3020 if (CallConv == CallingConv::X86_INTR) {
3021 bool isLegal = Ins.size() == 1 ||
3022 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3023 (!Is64Bit && Ins[1].VT == MVT::i32)));
3025 report_fatal_error("X86 interrupts may take one or two arguments");
3028 // Assign locations to all of the incoming arguments.
3029 SmallVector<CCValAssign, 16> ArgLocs;
3030 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3032 // Allocate shadow area for Win64.
3034 CCInfo.AllocateStack(32, 8);
3036 CCInfo.AnalyzeArguments(Ins, CC_X86);
3038 // In vectorcall calling convention a second pass is required for the HVA
3040 if (CallingConv::X86_VectorCall == CallConv) {
3041 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3044 // The next loop assumes that the locations are in the same order of the
3046 assert(isSortedByValueNo(ArgLocs) &&
3047 "Argument Location list must be sorted before lowering");
3050 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3052 assert(InsIndex < Ins.size() && "Invalid Ins index");
3053 CCValAssign &VA = ArgLocs[I];
3055 if (VA.isRegLoc()) {
3056 EVT RegVT = VA.getLocVT();
3057 if (VA.needsCustom()) {
3059 VA.getValVT() == MVT::v64i1 &&
3060 "Currently the only custom case is when we split v64i1 to 2 regs");
3062 // v64i1 values, in regcall calling convention, that are
3063 // compiled to 32 bit arch, are split up into two registers.
3065 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3067 const TargetRegisterClass *RC;
3068 if (RegVT == MVT::i8)
3069 RC = &X86::GR8RegClass;
3070 else if (RegVT == MVT::i16)
3071 RC = &X86::GR16RegClass;
3072 else if (RegVT == MVT::i32)
3073 RC = &X86::GR32RegClass;
3074 else if (Is64Bit && RegVT == MVT::i64)
3075 RC = &X86::GR64RegClass;
3076 else if (RegVT == MVT::f32)
3077 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3078 else if (RegVT == MVT::f64)
3079 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3080 else if (RegVT == MVT::f80)
3081 RC = &X86::RFP80RegClass;
3082 else if (RegVT == MVT::f128)
3083 RC = &X86::VR128RegClass;
3084 else if (RegVT.is512BitVector())
3085 RC = &X86::VR512RegClass;
3086 else if (RegVT.is256BitVector())
3087 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3088 else if (RegVT.is128BitVector())
3089 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3090 else if (RegVT == MVT::x86mmx)
3091 RC = &X86::VR64RegClass;
3092 else if (RegVT == MVT::v1i1)
3093 RC = &X86::VK1RegClass;
3094 else if (RegVT == MVT::v8i1)
3095 RC = &X86::VK8RegClass;
3096 else if (RegVT == MVT::v16i1)
3097 RC = &X86::VK16RegClass;
3098 else if (RegVT == MVT::v32i1)
3099 RC = &X86::VK32RegClass;
3100 else if (RegVT == MVT::v64i1)
3101 RC = &X86::VK64RegClass;
3103 llvm_unreachable("Unknown argument type!");
3105 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3106 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3109 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3110 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3112 if (VA.getLocInfo() == CCValAssign::SExt)
3113 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3114 DAG.getValueType(VA.getValVT()));
3115 else if (VA.getLocInfo() == CCValAssign::ZExt)
3116 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3117 DAG.getValueType(VA.getValVT()));
3118 else if (VA.getLocInfo() == CCValAssign::BCvt)
3119 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3121 if (VA.isExtInLoc()) {
3122 // Handle MMX values passed in XMM regs.
3123 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3124 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3125 else if (VA.getValVT().isVector() &&
3126 VA.getValVT().getScalarType() == MVT::i1 &&
3127 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3128 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3129 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3130 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3132 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3135 assert(VA.isMemLoc());
3137 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3140 // If value is passed via pointer - do a load.
3141 if (VA.getLocInfo() == CCValAssign::Indirect)
3143 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3145 InVals.push_back(ArgValue);
3148 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3149 // Swift calling convention does not require we copy the sret argument
3150 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3151 if (CallConv == CallingConv::Swift)
3154 // All x86 ABIs require that for returning structs by value we copy the
3155 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3156 // the argument into a virtual register so that we can access it from the
3158 if (Ins[I].Flags.isSRet()) {
3159 unsigned Reg = FuncInfo->getSRetReturnReg();
3161 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3162 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3163 FuncInfo->setSRetReturnReg(Reg);
3165 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3166 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3171 unsigned StackSize = CCInfo.getNextStackOffset();
3172 // Align stack specially for tail calls.
3173 if (shouldGuaranteeTCO(CallConv,
3174 MF.getTarget().Options.GuaranteedTailCallOpt))
3175 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3177 // If the function takes variable number of arguments, make a frame index for
3178 // the start of the first vararg value... for expansion of llvm.va_start. We
3179 // can skip this if there are no va_start calls.
3180 if (MFI.hasVAStart() &&
3181 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3182 CallConv != CallingConv::X86_ThisCall))) {
3183 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3186 // Figure out if XMM registers are in use.
3187 assert(!(Subtarget.useSoftFloat() &&
3188 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3189 "SSE register cannot be used when SSE is disabled!");
3191 // 64-bit calling conventions support varargs and register parameters, so we
3192 // have to do extra work to spill them in the prologue.
3193 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3194 // Find the first unallocated argument registers.
3195 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3196 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3197 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3198 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3199 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3200 "SSE register cannot be used when SSE is disabled!");
3202 // Gather all the live in physical registers.
3203 SmallVector<SDValue, 6> LiveGPRs;
3204 SmallVector<SDValue, 8> LiveXMMRegs;
3206 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3207 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3209 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3211 if (!ArgXMMs.empty()) {
3212 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3213 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3214 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3215 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3216 LiveXMMRegs.push_back(
3217 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3222 // Get to the caller-allocated home save location. Add 8 to account
3223 // for the return address.
3224 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3225 FuncInfo->setRegSaveFrameIndex(
3226 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3227 // Fixup to set vararg frame on shadow area (4 x i64).
3229 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3231 // For X86-64, if there are vararg parameters that are passed via
3232 // registers, then we must store them to their spots on the stack so
3233 // they may be loaded by dereferencing the result of va_next.
3234 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3235 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3236 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3237 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3240 // Store the integer parameter registers.
3241 SmallVector<SDValue, 8> MemOps;
3242 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3243 getPointerTy(DAG.getDataLayout()));
3244 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3245 for (SDValue Val : LiveGPRs) {
3246 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3247 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3249 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3250 MachinePointerInfo::getFixedStack(
3251 DAG.getMachineFunction(),
3252 FuncInfo->getRegSaveFrameIndex(), Offset));
3253 MemOps.push_back(Store);
3257 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3258 // Now store the XMM (fp + vector) parameter registers.
3259 SmallVector<SDValue, 12> SaveXMMOps;
3260 SaveXMMOps.push_back(Chain);
3261 SaveXMMOps.push_back(ALVal);
3262 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3263 FuncInfo->getRegSaveFrameIndex(), dl));
3264 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3265 FuncInfo->getVarArgsFPOffset(), dl));
3266 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3268 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3269 MVT::Other, SaveXMMOps));
3272 if (!MemOps.empty())
3273 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3276 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3277 // Find the largest legal vector type.
3278 MVT VecVT = MVT::Other;
3279 // FIXME: Only some x86_32 calling conventions support AVX512.
3280 if (Subtarget.hasAVX512() &&
3281 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3282 CallConv == CallingConv::Intel_OCL_BI)))
3283 VecVT = MVT::v16f32;
3284 else if (Subtarget.hasAVX())
3286 else if (Subtarget.hasSSE2())
3289 // We forward some GPRs and some vector types.
3290 SmallVector<MVT, 2> RegParmTypes;
3291 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3292 RegParmTypes.push_back(IntVT);
3293 if (VecVT != MVT::Other)
3294 RegParmTypes.push_back(VecVT);
3296 // Compute the set of forwarded registers. The rest are scratch.
3297 SmallVectorImpl<ForwardedRegister> &Forwards =
3298 FuncInfo->getForwardedMustTailRegParms();
3299 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3301 // Conservatively forward AL on x86_64, since it might be used for varargs.
3302 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3303 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3304 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3307 // Copy all forwards from physical to virtual registers.
3308 for (ForwardedRegister &F : Forwards) {
3309 // FIXME: Can we use a less constrained schedule?
3310 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3311 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3312 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3316 // Some CCs need callee pop.
3317 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3318 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3319 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3320 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3321 // X86 interrupts must pop the error code (and the alignment padding) if
3323 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3325 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3326 // If this is an sret function, the return should pop the hidden pointer.
3327 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3328 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3329 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3330 FuncInfo->setBytesToPopOnReturn(4);
3334 // RegSaveFrameIndex is X86-64 only.
3335 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3336 if (CallConv == CallingConv::X86_FastCall ||
3337 CallConv == CallingConv::X86_ThisCall)
3338 // fastcc functions can't have varargs.
3339 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3342 FuncInfo->setArgumentStackSize(StackSize);
3344 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3345 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3346 if (Personality == EHPersonality::CoreCLR) {
3348 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3349 // that we'd prefer this slot be allocated towards the bottom of the frame
3350 // (i.e. near the stack pointer after allocating the frame). Every
3351 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3352 // offset from the bottom of this and each funclet's frame must be the
3353 // same, so the size of funclets' (mostly empty) frames is dictated by
3354 // how far this slot is from the bottom (since they allocate just enough
3355 // space to accommodate holding this slot at the correct offset).
3356 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3357 EHInfo->PSPSymFrameIdx = PSPSymFI;
3361 if (CallConv == CallingConv::X86_RegCall ||
3362 F.hasFnAttribute("no_caller_saved_registers")) {
3363 MachineRegisterInfo &MRI = MF.getRegInfo();
3364 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3365 MRI.disableCalleeSavedRegister(Pair.first);
// Lower one outgoing call argument that the calling convention assigned to a
// stack slot: form the address StackPtr + LocMemOffset, then either emit a
// byval block copy or an ordinary store of Arg into that slot.
// NOTE(review): this listing is elided — the SelectionDAG parameter line and
// part of the ISD::ADD operand list (and the closing brace) are not visible.
3371 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3372 SDValue Arg, const SDLoc &dl,
3374 const CCValAssign &VA,
3375 ISD::ArgFlagsTy Flags) const {
// Byte offset of this argument's slot within the outgoing-argument area.
3376 unsigned LocMemOffset = VA.getLocMemOffset();
3377 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3378 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
// Byval arguments are copied as a memory block rather than stored as a value.
3380 if (Flags.isByVal())
3381 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
// Plain store of the (already promoted) argument value to its stack slot.
3383 return DAG.getStore(
3384 Chain, dl, Arg, PtrOff,
3385 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3388 /// Emit a load of return address if tail call
3389 /// optimization is performed and it is required.
// On return, OutRetAddr holds the loaded "old" return-address value; the
// function's own return value is the chain output (value #1) of that load.
3390 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3391 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3392 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3393 // Adjust the Return address stack slot.
3394 EVT VT = getPointerTy(DAG.getDataLayout());
// Frame index of the slot the return address currently lives in.
3395 OutRetAddr = getReturnAddressFrameIndex(DAG);
3397 // Load the "old" Return address.
3398 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
// Return the load's chain so the caller can sequence later memory ops on it.
3399 return SDValue(OutRetAddr.getNode(), 1);
3402 /// Emit a store of the return address if tail call
3403 /// optimization is performed and it is required (FPDiff!=0).
// RetAddrFrIdx is the previously loaded return-address value; FPDiff is how
// far the argument area moved, so the address is re-stored SlotSize below the
// shifted position. Returns the updated chain (FPDiff==0: nothing to do).
3404 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3405 SDValue Chain, SDValue RetAddrFrIdx,
3406 EVT PtrVT, unsigned SlotSize,
3407 int FPDiff, const SDLoc &dl) {
3408 // Store the return address to the appropriate stack slot.
3409 if (!FPDiff) return Chain;
3410 // Calculate the new stack slot for the return address.
// NOTE(review): the trailing CreateFixedObject arguments and the final
// "return Chain;" are elided from this listing.
3411 int NewReturnAddrFI =
3412 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3414 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3415 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3416 MachinePointerInfo::getFixedStack(
3417 DAG.getMachineFunction(), NewReturnAddrFI));
3421 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3422 /// operation of specified width.
// Builds the mask [NumElems, 1, 2, ...]: lane 0 is taken from the second
// shuffle input (indices >= NumElems select V2), the remaining lanes
// presumably keep the first input's elements — the loop body line is elided
// from this listing, TODO confirm against the full source.
3423 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3425 unsigned NumElems = VT.getVectorNumElements();
3426 SmallVector<int, 8> Mask;
// Index NumElems == element 0 of the second operand.
3427 Mask.push_back(NumElems);
3428 for (unsigned i = 1; i != NumElems; ++i)
3430 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
// Lower an outgoing call (X86TargetLowering::LowerCall): classify arguments
// with CCState, materialize register copies and stack stores, resolve the
// callee address, then emit the X86ISD::CALL / NT_CALL / TC_RETURN node with
// its CALLSEQ_START/CALLSEQ_END bracketing and return-value copies.
// NOTE(review): this listing is elided — a number of original lines are
// missing, so several statements below appear without their full context.
3434 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3435 SmallVectorImpl<SDValue> &InVals) const {
3436 SelectionDAG &DAG = CLI.DAG;
3438 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3439 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3440 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3441 SDValue Chain = CLI.Chain;
3442 SDValue Callee = CLI.Callee;
3443 CallingConv::ID CallConv = CLI.CallConv;
3444 bool &isTailCall = CLI.IsTailCall;
3445 bool isVarArg = CLI.IsVarArg;
3447 MachineFunction &MF = DAG.getMachineFunction();
3448 bool Is64Bit = Subtarget.is64Bit();
3449 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3450 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3451 bool IsSibcall = false;
3452 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3453 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3454 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3455 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
// NCSR: either the call site or the callee carries no_caller_saved_registers.
3456 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3457 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3458 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3460 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3461 const Module *M = MF.getMMI().getModule();
3462 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3464 if (CallConv == CallingConv::X86_INTR)
3465 report_fatal_error("X86 interrupts may not be called directly");
3467 if (Attr.getValueAsString() == "true")
3470 if (Subtarget.isPICStyleGOT() &&
3471 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3472 // If we are using a GOT, disable tail calls to external symbols with
3473 // default visibility. Tail calling such a symbol requires using a GOT
3474 // relocation, which forces early binding of the symbol. This breaks code
3475 // that requires lazy function symbol resolution. Using musttail or
3476 // GuaranteedTailCallOpt will override this.
3477 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3478 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3479 G->getGlobal()->hasDefaultVisibility()))
// musttail calls are always tail calls; otherwise run the eligibility check.
3483 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3485 // Force this to be a tail call. The verifier rules are enough to ensure
3486 // that we can lower this successfully without moving the return address
3489 } else if (isTailCall) {
3490 // Check if it's really possible to do a tail call.
3491 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3492 isVarArg, SR != NotStructReturn,
3493 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3494 Outs, OutVals, Ins, DAG);
3496 // Sibcalls are automatically detected tailcalls which do not require
3498 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3505 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3506 "Var args not supported with calling convention fastcc, ghc or hipe");
3508 // Analyze operands of the call, assigning locations to each operand.
3509 SmallVector<CCValAssign, 16> ArgLocs;
3510 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3512 // Allocate shadow area for Win64.
3514 CCInfo.AllocateStack(32, 8);
3516 CCInfo.AnalyzeArguments(Outs, CC_X86);
3518 // In vectorcall calling convention a second pass is required for the HVA
3520 if (CallingConv::X86_VectorCall == CallConv) {
3521 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3524 // Get a count of how many bytes are to be pushed on the stack.
3525 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3527 // This is a sibcall. The memory operands are available in caller's
3528 // own caller's stack.
3530 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3531 canGuaranteeTCO(CallConv))
3532 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
// FPDiff: how far the return address must move when the callee needs more
// argument stack than the caller provided (GuaranteedTailCallOpt tail calls).
3535 if (isTailCall && !IsSibcall && !IsMustTail) {
3536 // Lower arguments at fp - stackoffset + fpdiff.
3537 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3539 FPDiff = NumBytesCallerPushed - NumBytes;
3541 // Set the delta of movement of the returnaddr stackslot.
3542 // But only set if delta is greater than previous delta.
3543 if (FPDiff < X86Info->getTCReturnAddrDelta())
3544 X86Info->setTCReturnAddrDelta(FPDiff);
3547 unsigned NumBytesToPush = NumBytes;
3548 unsigned NumBytesToPop = NumBytes;
3550 // If we have an inalloca argument, all stack space has already been allocated
3551 // for us and it is right at the top of the stack. We don't support multiple
3552 // arguments passed in memory when using inalloca.
3553 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3555 if (!ArgLocs.back().isMemLoc())
3556 report_fatal_error("cannot use inalloca attribute on a register "
3558 if (ArgLocs.back().getLocMemOffset() != 0)
3559 report_fatal_error("any parameter with the inalloca attribute must be "
3560 "the only memory argument");
3564 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3565 NumBytes - NumBytesToPush, dl);
3567 SDValue RetAddrFrIdx;
3568 // Load return address for tail calls.
3569 if (isTailCall && FPDiff)
3570 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3571 Is64Bit, FPDiff, dl);
3573 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3574 SmallVector<SDValue, 8> MemOpChains;
3577 // The next loop assumes that the locations are in the same order of the
3579 assert(isSortedByValueNo(ArgLocs) &&
3580 "Argument Location list must be sorted before lowering");
3582 // Walk the register/memloc assignments, inserting copies/loads. In the case
3583 // of tail call optimization arguments are handled later.
3584 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3585 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3587 assert(OutIndex < Outs.size() && "Invalid Out index");
3588 // Skip inalloca arguments, they have already been written.
3589 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3590 if (Flags.isInAlloca())
3593 CCValAssign &VA = ArgLocs[I];
3594 EVT RegVT = VA.getLocVT();
3595 SDValue Arg = OutVals[OutIndex];
3596 bool isByVal = Flags.isByVal();
3598 // Promote the value if needed.
3599 switch (VA.getLocInfo()) {
3600 default: llvm_unreachable("Unknown loc info!");
3601 case CCValAssign::Full: break;
3602 case CCValAssign::SExt:
3603 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3605 case CCValAssign::ZExt:
3606 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3608 case CCValAssign::AExt:
3609 if (Arg.getValueType().isVector() &&
3610 Arg.getValueType().getVectorElementType() == MVT::i1)
3611 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3612 else if (RegVT.is128BitVector()) {
3613 // Special case: passing MMX values in XMM registers.
3614 Arg = DAG.getBitcast(MVT::i64, Arg);
3615 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3616 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3618 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3620 case CCValAssign::BCvt:
3621 Arg = DAG.getBitcast(RegVT, Arg);
3623 case CCValAssign::Indirect: {
3624 // Store the argument.
3625 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3626 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3627 Chain = DAG.getStore(
3628 Chain, dl, Arg, SpillSlot,
3629 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3635 if (VA.needsCustom()) {
3636 assert(VA.getValVT() == MVT::v64i1 &&
3637 "Currently the only custom case is when we split v64i1 to 2 regs");
3638 // Split v64i1 value into two registers
3639 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3641 } else if (VA.isRegLoc()) {
3642 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3643 if (isVarArg && IsWin64) {
3644 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3645 // shadow reg if callee is a varargs function.
3646 unsigned ShadowReg = 0;
3647 switch (VA.getLocReg()) {
3648 case X86::XMM0: ShadowReg = X86::RCX; break;
3649 case X86::XMM1: ShadowReg = X86::RDX; break;
3650 case X86::XMM2: ShadowReg = X86::R8; break;
3651 case X86::XMM3: ShadowReg = X86::R9; break;
3654 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3656 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3657 assert(VA.isMemLoc());
3658 if (!StackPtr.getNode())
3659 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3660 getPointerTy(DAG.getDataLayout()));
3661 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3662 dl, DAG, VA, Flags));
3666 if (!MemOpChains.empty())
3667 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3669 if (Subtarget.isPICStyleGOT()) {
3670 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3673 RegsToPass.push_back(std::make_pair(
3674 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3675 getPointerTy(DAG.getDataLayout()))));
3677 // If we are tail calling and generating PIC/GOT style code load the
3678 // address of the callee into ECX. The value in ecx is used as target of
3679 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3680 // for tail calls on PIC/GOT architectures. Normally we would just put the
3681 // address of GOT into ebx and then call target@PLT. But for tail calls
3682 // ebx would be restored (since ebx is callee saved) before jumping to the
3685 // Note: The actual moving to ECX is done further down.
3686 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3687 if (G && !G->getGlobal()->hasLocalLinkage() &&
3688 G->getGlobal()->hasDefaultVisibility())
3689 Callee = LowerGlobalAddress(Callee, DAG);
3690 else if (isa<ExternalSymbolSDNode>(Callee))
3691 Callee = LowerExternalSymbol(Callee, DAG);
3695 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3696 // From AMD64 ABI document:
3697 // For calls that may call functions that use varargs or stdargs
3698 // (prototype-less calls or calls to functions containing ellipsis (...) in
3699 // the declaration) %al is used as hidden argument to specify the number
3700 // of SSE registers used. The contents of %al do not need to match exactly
3701 // the number of registers, but must be an upper bound on the number of SSE
3702 // registers used and is in the range 0 - 8 inclusive.
3704 // Count the number of XMM registers allocated.
3705 static const MCPhysReg XMMArgRegs[] = {
3706 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3707 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3709 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3710 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3711 && "SSE registers cannot be used when SSE is disabled");
3713 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3714 DAG.getConstant(NumXMMRegs, dl,
// musttail varargs: re-materialize the registers saved in the prologue so the
// forwarded varargs state reaches the tail-called function intact.
3718 if (isVarArg && IsMustTail) {
3719 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3720 for (const auto &F : Forwards) {
3721 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3722 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3726 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3727 // don't need this because the eligibility check rejects calls that require
3728 // shuffling arguments passed in memory.
3729 if (!IsSibcall && isTailCall) {
3730 // Force all the incoming stack arguments to be loaded from the stack
3731 // before any new outgoing arguments are stored to the stack, because the
3732 // outgoing stack slots may alias the incoming argument stack slots, and
3733 // the alias isn't otherwise explicit. This is slightly more conservative
3734 // than necessary, because it means that each store effectively depends
3735 // on every argument instead of just those arguments it would clobber.
3736 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3738 SmallVector<SDValue, 8> MemOpChains2;
3741 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3743 CCValAssign &VA = ArgLocs[I];
3745 if (VA.isRegLoc()) {
3746 if (VA.needsCustom()) {
3747 assert((CallConv == CallingConv::X86_RegCall) &&
3748 "Expecting custom case only in regcall calling convention");
3749 // This means that we are in special case where one argument was
3750 // passed through two register locations - Skip the next location
3757 assert(VA.isMemLoc());
3758 SDValue Arg = OutVals[OutsIndex];
3759 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3760 // Skip inalloca arguments. They don't require any work.
3761 if (Flags.isInAlloca())
3763 // Create frame index.
3764 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3765 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3766 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3767 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3769 if (Flags.isByVal()) {
3770 // Copy relative to framepointer.
3771 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3772 if (!StackPtr.getNode())
3773 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3774 getPointerTy(DAG.getDataLayout()));
3775 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3778 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3782 // Store relative to framepointer.
3783 MemOpChains2.push_back(DAG.getStore(
3784 ArgChain, dl, Arg, FIN,
3785 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3789 if (!MemOpChains2.empty())
3790 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3792 // Store the return address to the appropriate stack slot.
3793 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3794 getPointerTy(DAG.getDataLayout()),
3795 RegInfo->getSlotSize(), FPDiff, dl);
3798 // Build a sequence of copy-to-reg nodes chained together with token chain
3799 // and flag operands which copy the outgoing args into registers.
3801 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3802 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3803 RegsToPass[i].second, InFlag);
3804 InFlag = Chain.getValue(1);
// Resolve the callee operand into the form the call node expects.
3807 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3808 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3809 // In the 64-bit large code model, we have to make all calls
3810 // through a register, since the call instruction's 32-bit
3811 // pc-relative offset may not be large enough to hold the whole
3813 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3814 // If the callee is a GlobalAddress node (quite common, every direct call
3815 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3817 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3819 // We should use extra load for direct calls to dllimported functions in
3821 const GlobalValue *GV = G->getGlobal();
3822 if (!GV->hasDLLImportStorageClass()) {
3823 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3825 Callee = DAG.getTargetGlobalAddress(
3826 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3828 if (OpFlags == X86II::MO_GOTPCREL) {
3830 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3831 getPointerTy(DAG.getDataLayout()), Callee);
3832 // Add extra indirection
3833 Callee = DAG.getLoad(
3834 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3835 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3838 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3839 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3840 unsigned char OpFlags =
3841 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3843 Callee = DAG.getTargetExternalSymbol(
3844 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3846 if (OpFlags == X86II::MO_GOTPCREL) {
3847 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3848 getPointerTy(DAG.getDataLayout()), Callee);
3849 Callee = DAG.getLoad(
3850 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3851 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3853 } else if (Subtarget.isTarget64BitILP32() &&
3854 Callee->getValueType(0) == MVT::i32) {
3855 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3856 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3859 // Returns a chain & a flag for retval copy to use.
3860 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3861 SmallVector<SDValue, 8> Ops;
3863 if (!IsSibcall && isTailCall) {
3864 Chain = DAG.getCALLSEQ_END(Chain,
3865 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3866 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3867 InFlag = Chain.getValue(1);
3870 Ops.push_back(Chain);
3871 Ops.push_back(Callee);
3874 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3876 // Add argument registers to the end of the list so that they are known live
3878 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3879 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3880 RegsToPass[i].second.getValueType()));
3882 // Add a register mask operand representing the call-preserved registers.
3883 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3884 // set X86_INTR calling convention because it has the same CSR mask
3885 // (same preserved registers).
3886 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3887 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3888 assert(Mask && "Missing call preserved mask for calling convention");
3890 // If this is an invoke in a 32-bit function using a funclet-based
3891 // personality, assume the function clobbers all registers. If an exception
3892 // is thrown, the runtime will not restore CSRs.
3893 // FIXME: Model this more precisely so that we can register allocate across
3894 // the normal edge and spill and fill across the exceptional edge.
3895 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3896 const Function &CallerFn = MF.getFunction();
3897 EHPersonality Pers =
3898 CallerFn.hasPersonalityFn()
3899 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3900 : EHPersonality::Unknown;
3901 if (isFuncletEHPersonality(Pers))
3902 Mask = RegInfo->getNoPreservedMask();
3905 // Define a new register mask from the existing mask.
3906 uint32_t *RegMask = nullptr;
3908 // In some calling conventions we need to remove the used physical registers
3909 // from the reg mask.
3910 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3911 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3913 // Allocate a new Reg Mask and copy Mask.
3914 RegMask = MF.allocateRegMask();
3915 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
3916 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
3918 // Make sure all sub registers of the argument registers are reset
3920 for (auto const &RegPair : RegsToPass)
3921 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3922 SubRegs.isValid(); ++SubRegs)
3923 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3925 // Create the RegMask Operand according to our updated mask.
3926 Ops.push_back(DAG.getRegisterMask(RegMask));
3928 // Create the RegMask Operand according to the static mask.
3929 Ops.push_back(DAG.getRegisterMask(Mask));
3932 if (InFlag.getNode())
3933 Ops.push_back(InFlag);
3937 //// If this is the first return lowered for this function, add the regs
3938 //// to the liveout set for the function.
3939 // This isn't right, although it's probably harmless on x86; liveouts
3940 // should be computed from returns not tail calls. Consider a void
3941 // function making a tail call to a function returning int.
3942 MF.getFrameInfo().setHasTailCall();
3943 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3946 if (HasNoCfCheck && IsCFProtectionSupported) {
3947 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
3949 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3951 InFlag = Chain.getValue(1);
3953 // Create the CALLSEQ_END node.
// Compute how many bytes the callee pops on return (callee-pop conventions
// and 32-bit struct-return conventions).
3954 unsigned NumBytesForCalleeToPop;
3955 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3956 DAG.getTarget().Options.GuaranteedTailCallOpt))
3957 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3958 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3959 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3960 SR == StackStructReturn)
3961 // If this is a call to a struct-return function, the callee
3962 // pops the hidden struct pointer, so we have to push it back.
3963 // This is common for Darwin/X86, Linux & Mingw32 targets.
3964 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3965 NumBytesForCalleeToPop = 4;
3967 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3969 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3970 // No need to reset the stack after the call if the call doesn't return. To
3971 // make the MI verify, we'll pretend the callee does it for us.
3972 NumBytesForCalleeToPop = NumBytes;
3975 // Returns a flag for retval copy to use.
3977 Chain = DAG.getCALLSEQ_END(Chain,
3978 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3979 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3982 InFlag = Chain.getValue(1);
3985 // Handle result values, copying them out of physregs into vregs that we
3987 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3991 //===----------------------------------------------------------------------===//
3992 // Fast Calling Convention (tail call) implementation
3993 //===----------------------------------------------------------------------===//
3995 // Like stdcall — the callee cleans up the arguments — except that ECX is
3996 // reserved for storing the tail called function address. Only 2 registers are
3997 // free for argument passing (inreg). Tail call optimization is performed
3999 // * tailcallopt is enabled
4000 // * caller/callee are fastcc
4001 // On X86_64 architecture with GOT-style position independent code only local
4002 // (within module) calls are supported at the moment.
4003 // To keep the stack aligned according to the platform ABI, the function
4004 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
4005 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
4006 // If a tail called function callee has more arguments than the caller the
4007 // caller needs to make sure that there is room to move the RETADDR to. This is
4008 // achieved by reserving an area the size of the argument delta right after the
4009 // original RETADDR, but before the saved framepointer or the spilled registers
4010 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4022 /// Round up the argument stack size so the stack stays aligned once the
/// return-address slot is accounted for, e.g. 16*n + 12 for a 16-byte
/// stack alignment with a 4-byte slot.
4025 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4026 SelectionDAG& DAG) const {
4027 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4028 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4029 unsigned StackAlignment = TFI.getStackAlignment();
// Low bits that must equal (StackAlignment - SlotSize) on return.
4030 uint64_t AlignMask = StackAlignment - 1;
4031 int64_t Offset = StackSize;
4032 unsigned SlotSize = RegInfo->getSlotSize();
4033 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
4034 // Number smaller than 12 so just add the difference.
4035 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
// NOTE(review): the "else {" line and the final "return Offset;" are elided
// from this listing.
4037 // Mask out lower bits, add stackalignment once plus the 12 bytes.
4038 Offset = ((~AlignMask) & Offset) + StackAlignment +
4039 (StackAlignment-SlotSize);
4044 /// Return true if the given stack call argument is already available in the
4045 /// same position (relatively) of the caller's incoming argument stack.
4047 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4048                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4049                          const X86InstrInfo *TII, const CCValAssign &VA) {
// Bytes: size of the outgoing value; may be replaced by the byval size below.
4050 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4053 // Look through nodes that don't alter the bits of the incoming value.
4054 unsigned Op = Arg.getOpcode();
4055 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4056 Arg = Arg.getOperand(0);
// A TRUNCATE of an AssertZext back to the truncated type is also a no-op on
// the bits we care about; look through both nodes.
4059 if (Op == ISD::TRUNCATE) {
4060 const SDValue &TruncInput = Arg.getOperand(0);
4061 if (TruncInput.getOpcode() == ISD::AssertZext &&
4062 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4063 Arg.getValueType()) {
4064 Arg = TruncInput.getOperand(0);
// Try to discover the frame index (FI) the argument ultimately comes from.
// NOTE(review): FI appears to be initialized to INT_MAX on an elided line;
// the assert below relies on one of these cases having set it.
4072 if (Arg.getOpcode() == ISD::CopyFromReg) {
4073 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4074 if (!TargetRegisterInfo::isVirtualRegister(VR))
4076 MachineInstr *Def = MRI->getVRegDef(VR);
4079 if (!Flags.isByVal()) {
// Non-byval: the vreg must be defined by a load from a stack slot.
4080 if (!TII->isLoadFromStackSlot(*Def, FI))
// Byval passed in a register: accept an LEA of a frame index.
4083 unsigned Opcode = Def->getOpcode();
4084 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4085 Opcode == X86::LEA64_32r) &&
4086 Def->getOperand(1).isFI()) {
4087 FI = Def->getOperand(1).getIndex();
4088 Bytes = Flags.getByValSize();
4092 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4093 if (Flags.isByVal())
4094 // ByVal argument is passed in as a pointer but it's now being
4095 // dereferenced. e.g.
4096 // define @foo(%struct.X* %A) {
4097 // tail call @bar(%struct.X* byval %A)
4100 SDValue Ptr = Ld->getBasePtr();
4101 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4104 FI = FINode->getIndex();
4105 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4106 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4107 FI = FINode->getIndex();
4108 Bytes = Flags.getByValSize();
4112 assert(FI != INT_MAX);
// Only fixed (incoming-argument) stack objects can match the caller's layout.
4113 if (!MFI.isFixedObjectIndex(FI))
4116 if (Offset != MFI.getObjectOffset(FI))
4119 // If this is not byval, check that the argument stack object is immutable.
4120 // inalloca and argument copy elision can create mutable argument stack
4121 // objects. Byval objects can be mutated, but a byval call intends to pass the
4123 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4126 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4127 // If the argument location is wider than the argument type, check that any
4128 // extension flags match.
4129 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4130 Flags.isSExt() != MFI.isObjectSExt(FI)) {
// Finally the sizes must agree exactly.
4135 return Bytes == MFI.getObjectSize(FI);
4138 /// Check whether the call is eligible for tail call optimization. Targets
4139 /// that want to do tail call optimization should implement this function.
4139 /// Performs a sequence of conservative rejections (return-type mismatch,
4139 /// Win64 shadow-space mismatch, sret, dynamic realignment, x87 results,
4139 /// argument layout, callee-saved registers, callee-pop byte counts).
4140 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4141     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4142     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4143     const SmallVectorImpl<ISD::OutputArg> &Outs,
4144     const SmallVectorImpl<SDValue> &OutVals,
4145     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4146 if (!mayTailCallThisCC(CalleeCC))
4149 // If -tailcallopt is specified, make fastcc functions tail-callable.
4150 MachineFunction &MF = DAG.getMachineFunction();
4151 const Function &CallerF = MF.getFunction();
4153 // If the function return type is x86_fp80 and the callee return type is not,
4154 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4155 // perform a tailcall optimization here.
4156 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4159 CallingConv::ID CallerCC = CallerF.getCallingConv();
4160 bool CCMatch = CallerCC == CalleeCC;
4161 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4162 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4164 // Win64 functions have extra shadow space for argument homing. Don't do the
4165 // sibcall if the caller and callee have mismatched expectations for this
4167 if (IsCalleeWin64 != IsCallerWin64)
// Under GuaranteedTailCallOpt, a matching TCO-capable convention is enough.
4170 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4171 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4176 // Look for obvious safe cases to perform tail call optimization that do not
4177 // require ABI changes. This is what gcc calls sibcall.
4179 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4180 // emit a special epilogue.
4181 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4182 if (RegInfo->needsStackRealignment(MF))
4185 // Also avoid sibcall optimization if either caller or callee uses struct
4186 // return semantics.
4187 if (isCalleeStructRet || isCallerStructRet)
4190 // Do not sibcall optimize vararg calls unless all arguments are passed via
4192 LLVMContext &C = *DAG.getContext();
4193 if (isVarArg && !Outs.empty()) {
4194 // Optimizing for varargs on Win64 is unlikely to be safe without
4195 // additional testing.
4196 if (IsCalleeWin64 || IsCallerWin64)
4199 SmallVector<CCValAssign, 16> ArgLocs;
4200 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4202 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
// Reject if any vararg operand is assigned to a stack location.
4203 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4204 if (!ArgLocs[i].isRegLoc())
4208 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4209 // stack. Therefore, if it's not used by the call it is not safe to optimize
4210 // this into a sibcall.
4211 bool Unused = false;
4212 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4219 SmallVector<CCValAssign, 16> RVLocs;
4220 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4221 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4222 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4223 CCValAssign &VA = RVLocs[i];
4224 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4229 // Check that the call results are passed in the same way.
4230 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4231 RetCC_X86, RetCC_X86))
4233 // The callee has to preserve all registers the caller needs to preserve.
4234 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4235 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4237 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4238 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4242 unsigned StackArgsSize = 0;
4244 // If the callee takes no arguments then go on to check the results of the
4246 if (!Outs.empty()) {
4247 // Check if stack adjustment is needed. For now, do not do this if any
4248 // argument is passed on the stack.
4249 SmallVector<CCValAssign, 16> ArgLocs;
4250 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4252 // Allocate shadow area for Win64
4254 CCInfo.AllocateStack(32, 8);
4256 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4257 StackArgsSize = CCInfo.getNextStackOffset();
4259 if (CCInfo.getNextStackOffset()) {
4260 // Check if the arguments are already laid out in the right way as
4261 // the caller's fixed stack objects.
4262 MachineFrameInfo &MFI = MF.getFrameInfo();
4263 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4264 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4265 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4266 CCValAssign &VA = ArgLocs[i];
4267 SDValue Arg = OutVals[i];
4268 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4269 if (VA.getLocInfo() == CCValAssign::Indirect)
4271 if (!VA.isRegLoc()) {
4272 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4279 bool PositionIndependent = isPositionIndependent();
4280 // If the tailcall address may be in a register, then make sure it's
4281 // possible to register allocate for it. In 32-bit, the call address can
4282 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4283 // callee-saved registers are restored. These happen to be the same
4284 // registers used to pass 'inreg' arguments so watch out for those.
4285 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4286 !isa<ExternalSymbolSDNode>(Callee)) ||
4287 PositionIndependent)) {
4288 unsigned NumInRegs = 0;
4289 // In PIC we need an extra register to formulate the address computation
4291 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4293 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4294 CCValAssign &VA = ArgLocs[i];
4297 unsigned Reg = VA.getLocReg();
4300 case X86::EAX: case X86::EDX: case X86::ECX:
4301 if (++NumInRegs == MaxInRegs)
// All stack arguments that live in callee-saved registers must match.
4308 const MachineRegisterInfo &MRI = MF.getRegInfo();
4309 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4313 bool CalleeWillPop =
4314 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4315 MF.getTarget().Options.GuaranteedTailCallOpt);
4317 if (unsigned BytesToPop =
4318 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4319 // If we have bytes to pop, the callee must pop them.
4320 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4321 if (!CalleePopMatches)
4323 } else if (CalleeWillPop && StackArgsSize > 0) {
4324 // If we don't have bytes to pop, make sure the callee doesn't pop any.
// Create the X86-specific FastISel instance for this function.
4332 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4333                                   const TargetLibraryInfo *libInfo) const {
4334 return X86::createFastISel(funcInfo, libInfo);
4337 //===----------------------------------------------------------------------===//
4338 // Other Lowering Hooks
4339 //===----------------------------------------------------------------------===//
/// Return true if Op is a plain (non-extending, non-indexed) load with a
/// single use, i.e. a candidate for folding into another instruction.
4341 static bool MayFoldLoad(SDValue Op) {
4342 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
/// Return true if Op has exactly one use and that use is a normal store —
/// note this inspects Op's (sole) user, not Op itself.
4345 static bool MayFoldIntoStore(SDValue Op) {
4346 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
/// Return true if Op's single user is a ZERO_EXTEND (so Op may be folded
/// into it).
4349 static bool MayFoldIntoZeroExtend(SDValue Op) {
4350 if (Op.hasOneUse()) {
4351 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4352 return (ISD::ZERO_EXTEND == Opcode);
/// Return true if Opcode is one of the X86-specific shuffle node opcodes
/// whose mask can be decoded by the shuffle-combining machinery.
4357 static bool isTargetShuffle(unsigned Opcode) {
4359 default: return false;
4360 case X86ISD::BLENDI:
4361 case X86ISD::PSHUFB:
4362 case X86ISD::PSHUFD:
4363 case X86ISD::PSHUFHW:
4364 case X86ISD::PSHUFLW:
4366 case X86ISD::INSERTPS:
4367 case X86ISD::EXTRQI:
4368 case X86ISD::INSERTQI:
4369 case X86ISD::PALIGNR:
4370 case X86ISD::VSHLDQ:
4371 case X86ISD::VSRLDQ:
4372 case X86ISD::MOVLHPS:
4373 case X86ISD::MOVHLPS:
4374 case X86ISD::MOVSHDUP:
4375 case X86ISD::MOVSLDUP:
4376 case X86ISD::MOVDDUP:
4379 case X86ISD::UNPCKL:
4380 case X86ISD::UNPCKH:
4381 case X86ISD::VBROADCAST:
4382 case X86ISD::VPERMILPI:
4383 case X86ISD::VPERMILPV:
4384 case X86ISD::VPERM2X128:
4385 case X86ISD::SHUF128:
4386 case X86ISD::VPERMIL2:
4387 case X86ISD::VPERMI:
4388 case X86ISD::VPPERM:
4389 case X86ISD::VPERMV:
4390 case X86ISD::VPERMV3:
4391 case X86ISD::VZEXT_MOVL:
/// Return true for target shuffle opcodes whose mask comes from a variable
/// (register/memory) operand rather than an immediate.
4396 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4398 default: return false;
4400 case X86ISD::PSHUFB:
4401 case X86ISD::VPERMILPV:
4402 case X86ISD::VPERMIL2:
4403 case X86ISD::VPPERM:
4404 case X86ISD::VPERMV:
4405 case X86ISD::VPERMV3:
4407 // 'Faux' Target Shuffles.
/// Return a frame index for the return-address slot, lazily creating a fixed
/// stack object the first time and caching the index in the function info.
4414 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4415 MachineFunction &MF = DAG.getMachineFunction();
4416 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4417 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4418 int ReturnAddrIndex = FuncInfo->getRAIndex();
4420 if (ReturnAddrIndex == 0) {
4421 // Set up a frame object for the return address.
4422 unsigned SlotSize = RegInfo->getSlotSize();
4423 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4426 FuncInfo->setRAIndex(ReturnAddrIndex);
4429 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
/// Return true if a constant Offset can be folded into an addressing mode
/// under the given code model, possibly combined with a symbolic displacement.
4432 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4433                                        bool hasSymbolicDisplacement) {
4434 // Offset should fit into 32 bit immediate field.
4435 if (!isInt<32>(Offset))
4438 // If we don't have a symbolic displacement - we don't have any extra
4440 if (!hasSymbolicDisplacement)
4443 // FIXME: Some tweaks might be needed for medium code model.
4444 if (M != CodeModel::Small && M != CodeModel::Kernel)
4447 // For small code model we assume that latest object is 16MB before end of 31
4448 // bits boundary. We may also accept pretty large negative constants knowing
4449 // that all objects are in the positive half of address space.
4450 if (M == CodeModel::Small && Offset < 16*1024*1024)
4453 // For kernel code model we know that all object resist in the negative half
4454 // of 32bits address space. We may not accept negative offsets, since they may
4455 // be just off and we may accept pretty large positive ones.
4456 if (M == CodeModel::Kernel && Offset >= 0)
4462 /// Determines whether the callee is required to pop its own arguments.
4463 /// Callee pop is necessary to support tail calls.
4464 bool X86::isCalleePop(CallingConv::ID CallingConv,
4465                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4466 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4467 // can guarantee TCO.
4468 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
// The classic callee-pop conventions (32-bit stdcall family + vectorcall).
4471 switch (CallingConv) {
4474 case CallingConv::X86_StdCall:
4475 case CallingConv::X86_FastCall:
4476 case CallingConv::X86_ThisCall:
4477 case CallingConv::X86_VectorCall:
4482 /// Return true if the condition is an unsigned comparison operation.
4483 static bool isX86CCUnsigned(unsigned X86CC) {
// Only the unreachable default is visible here; the case list classifying
// each X86::COND_* value is elided from this view.
4486 llvm_unreachable("Invalid integer condition!");
/// Map a generic integer ISD::CondCode to the corresponding X86 condition
/// code (signed compares -> G/GE/L/LE, unsigned -> A/AE/B/BE).
4502 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4503 switch (SetCCOpcode) {
4504 default: llvm_unreachable("Invalid integer condition!");
4505 case ISD::SETEQ: return X86::COND_E;
4506 case ISD::SETGT: return X86::COND_G;
4507 case ISD::SETGE: return X86::COND_GE;
4508 case ISD::SETLT: return X86::COND_L;
4509 case ISD::SETLE: return X86::COND_LE;
4510 case ISD::SETNE: return X86::COND_NE;
4511 case ISD::SETULT: return X86::COND_B;
4512 case ISD::SETUGT: return X86::COND_A;
4513 case ISD::SETULE: return X86::COND_BE;
4514 case ISD::SETUGE: return X86::COND_AE;
4518 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4519 /// condition code, returning the condition code and the LHS/RHS of the
4520 /// comparison to make.
4521 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4522                                     bool isFP, SDValue &LHS, SDValue &RHS,
4523                                     SelectionDAG &DAG) {
// Integer path: first try constant-RHS simplifications, then fall back to
// the straight table translation.
4525 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4526 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4527 // X > -1 -> X == 0, jump !sign.
4528 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4529 return X86::COND_NS;
4531 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4532 // X < 0 -> X == 0, jump on sign.
4535 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
// X < 1 -> X <= 0.
4537 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4538 return X86::COND_LE;
4542 return TranslateIntegerX86CC(SetCCOpcode);
4545 // First determine if it is required or is profitable to flip the operands.
4547 // If LHS is a foldable load, but RHS is not, flip the condition.
4548 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4549 !ISD::isNON_EXTLoad(RHS.getNode())) {
4550 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4551 std::swap(LHS, RHS);
4554 switch (SetCCOpcode) {
4560 std::swap(LHS, RHS);
4564 // On a floating point condition, the flags are set as follows:
// ZF | PF | CF | meaning
4566 // 0 | 0 | 0 | X > Y
4567 // 0 | 0 | 1 | X < Y
4568 // 1 | 0 | 0 | X == Y
4569 // 1 | 1 | 1 | unordered
4570 switch (SetCCOpcode) {
4571 default: llvm_unreachable("Condcode should be pre-legalized away");
4573 case ISD::SETEQ: return X86::COND_E;
4574 case ISD::SETOLT: // flipped
4576 case ISD::SETGT: return X86::COND_A;
4577 case ISD::SETOLE: // flipped
4579 case ISD::SETGE: return X86::COND_AE;
4580 case ISD::SETUGT: // flipped
4582 case ISD::SETLT: return X86::COND_B;
4583 case ISD::SETUGE: // flipped
4585 case ISD::SETLE: return X86::COND_BE;
4587 case ISD::SETNE: return X86::COND_NE;
4588 case ISD::SETUO: return X86::COND_P;
4589 case ISD::SETO: return X86::COND_NP;
4591 case ISD::SETUNE: return X86::COND_INVALID;
4595 /// Is there a floating point cmov for the specific X86 condition code?
4596 /// Current x86 isa includes the following FP cmov instructions:
4597 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4598 static bool hasFPCMov(unsigned X86CC) {
// The switch over X86CC values is elided from this view.
/// Describe the memory access performed by a memory-touching X86 intrinsic
/// so the scheduler/alias analysis can model it; only the truncating-store
/// intrinsic cases are visible in this view.
4615 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4617                                            MachineFunction &MF,
4618                                            unsigned Intrinsic) const {
4620 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4624 Info.opc = ISD::INTRINSIC_W_CHAIN;
4625 Info.flags = MachineMemOperand::MONone;
4628 switch (IntrData->Type) {
4629 case TRUNCATE_TO_MEM_VI8:
4630 case TRUNCATE_TO_MEM_VI16:
4631 case TRUNCATE_TO_MEM_VI32: {
// Arg 0 is the destination pointer; arg 1 supplies the source vector type.
4632 Info.ptrVal = I.getArgOperand(0);
4633 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4634 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4635 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4637 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4638 ScalarVT = MVT::i16;
4639 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4640 ScalarVT = MVT::i32;
// The in-memory type is the truncated element type with the same count.
4642 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4644 Info.flags |= MachineMemOperand::MOStore;
4654 /// Returns true if the target can instruction select the
4655 /// specified FP immediate natively. If false, the legalizer will
4656 /// materialize the FP immediate as a load from a constant pool.
4657 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// Linear scan of the small table of immediates registered at construction.
4658 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4659 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
/// Returns false (blocking load narrowing) for GOTTPOFF-relocated loads,
/// which must stay full width to match the TLS relocation's movq/addq.
4665 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4666                                               ISD::LoadExtType ExtTy,
4668 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4669 // relocation target a movq or addq instruction: don't let the load shrink.
4670 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4671 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4672 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4673 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4677 /// Returns true if it is beneficial to convert a load of a constant
4678 /// to just the constant itself.
4679 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4681 assert(Ty->isIntegerTy());
// Immediates wider than 64 bits can't be encoded in a single instruction.
4683 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4684 if (BitSize == 0 || BitSize > 64)
/// Whether select-of-constants should be turned into arithmetic; disabled
/// for AVX512 vectors to avoid conflicting DAGCombiner folds.
4689 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4690 // TODO: It might be a win to ease or lift this restriction, but the generic
4691 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4692 if (VT.isVector() && Subtarget.hasAVX512())
/// A subvector extract is cheap when it corresponds to a subregister copy,
/// i.e. the index is a multiple of the result's element count.
4698 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4699                                                 unsigned Index) const {
4700 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4703 // Mask vectors support all subregister combinations and operations that
4704 // extract half of vector.
4705 if (ResVT.getVectorElementType() == MVT::i1)
4706 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4707 (Index == ResVT.getVectorNumElements()));
4709 return (Index % ResVT.getVectorNumElements()) == 0;
/// cttz is cheap to speculate only when TZCNT exists (BMI).
4712 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4713 // Speculate cttz only if we can directly use TZCNT.
4714 return Subtarget.hasBMI();
/// ctlz is cheap to speculate only when LZCNT exists.
4717 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4718 // Speculate ctlz only if we can directly use LZCNT.
4719 return Subtarget.hasLZCNT();
/// Disallow load+bitcast to v8i1 without DQI (no k-register byte load);
/// otherwise defer to the generic implementation.
4722 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4723                                                 EVT BitcastVT) const {
4724 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4727 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4730 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4731                                          const SelectionDAG &DAG) const {
4732 // Do not merge to a width that would require vector (XMM) stores when the
4733 // NoImplicitFloat attribute is set; cap merges at the native integer size.
4734 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4735 Attribute::NoImplicitFloat);
4738 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4739 return (MemVT.getSizeInBits() <= MaxIntSize);
/// ctlz is "fast" when the subtarget reports a fast LZCNT implementation.
4744 bool X86TargetLowering::isCtlzFast() const {
4745 return Subtarget.hasFastLZCNT();
/// Whether (and x, c) == 0 folding is beneficial; body elided in this view.
4748 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4749     const Instruction &AndI) const {
/// Return true if an ANDN-based compare is profitable: requires BMI and a
/// 32- or 64-bit scalar type (the only ANDN widths).
4753 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4754 EVT VT = Y.getValueType();
4759 if (!Subtarget.hasBMI())
4762 // There are only 32-bit and 64-bit forms for 'andn'.
4763 if (VT != MVT::i32 && VT != MVT::i64)
4766 // A mask and compare against constant is ok for an 'andn' too
4767 // even though the BMI instruction doesn't have an immediate form.
/// Return true if an and-not of Y is cheap: scalars delegate to the BMI
/// check (immediates excluded); vectors need SSE (PANDN/ANDNPS) and >=128 bits.
4772 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4773 EVT VT = Y.getValueType();
4775 if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
4776 return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
4780 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4783 if (VT == MVT::v4i32)
4786 return Subtarget.hasSSE2();
/// Prefer a shift-pair over a mask to clear high/low bits, except for i64 on
/// 32-bit targets where 64-bit shifts expand badly.
4789 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
4790 EVT VT = Y.getValueType();
4792 // For vectors, we don't have a preference, but we probably want a mask.
4796 // 64-bit shifts on 32-bit targets produce really bad bloated code.
4797 if (VT == MVT::i64 && !Subtarget.is64Bit())
/// Return the widest type for which an equality memcmp expansion is fast:
/// a legal integer type, or a vector usable via (V)PMOVMSKB.
4803 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4804 MVT VT = MVT::getIntegerVT(NumBits);
4805 if (isTypeLegal(VT))
4808 // PMOVMSKB can handle this.
4809 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4812 // VPMOVMSKB can handle this.
4813 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4816 // TODO: Allow 64-bit type for 32-bit target.
4817 // TODO: 512-bit types should be allowed, but make sure that those
4818 // cases are handled in combineVectorSizedSetCCEquality().
4820 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4823 /// Val is the undef sentinel value or equal to the specified value.
4824 static bool isUndefOrEqual(int Val, int CmpVal) {
4825 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4828 /// Val is either the undef or zero sentinel value.
4829 static bool isUndefOrZero(int Val) {
4830 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4833 /// Return true if every element in Mask, beginning
4834 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4835 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4836 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4837 if (Mask[i] != SM_SentinelUndef)
4842 /// Return true if Val falls within the half-open range [Low, Hi).
4843 static bool isInRange(int Val, int Low, int Hi) {
4844 return (Val >= Low && Val < Hi);
4847 /// Return true if the value of any element in Mask falls within the specified
4849 /// half-open range [Low, Hi).
4849 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
4851 if (isInRange(M, Low, Hi))
4856 /// Return true if Val is undef or if its value falls within the
4857 /// half-open range [Low, Hi).
4858 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4859 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
4862 /// Return true if every element in Mask is undef or if its value
4863 /// falls within the half-open range [Low, Hi).
4864 static bool isUndefOrInRange(ArrayRef<int> Mask,
4867 if (!isUndefOrInRange(M, Low, Hi))
4872 /// Return true if Val is undef, zero or if its value falls within the
4873 /// half-open range [Low, Hi).
4874 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4875 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
4878 /// Return true if every element in Mask is undef, zero or if its value
4879 /// falls within the half-open range [Low, Hi).
4880 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4882 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4887 /// Return true if every element in Mask, beginning
4888 /// from position Pos and ending in Pos + Size, falls within the specified
4889 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
4890 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4891                                        unsigned Size, int Low, int Step = 1) {
4892 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4893 if (!isUndefOrEqual(Mask[i], Low))
4898 /// Return true if every element in Mask, beginning
4899 /// from position Pos and ending in Pos+Size, falls within the specified
4900 /// sequence (Low, Low+1, ..., Low+Size-1), or is undef or is zero.
4901 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4902                                              unsigned Size, int Low) {
4903 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4904 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4909 /// Return true if every element in Mask, beginning
4910 /// from position Pos and ending in Pos+Size is undef or is zero.
4911 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4913 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4914 if (!isUndefOrZero(Mask[i]))
4919 /// Helper function to test whether a shuffle mask could be
4920 /// simplified by widening the elements being shuffled.
4922 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4923 /// leaves it in an unspecified state.
4925 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4926 /// shuffle masks. The latter have the special property of a '-2' representing
4927 /// a zero-ed lane of a vector.
4928 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4929                                     SmallVectorImpl<int> &WidenedMask) {
4930 WidenedMask.assign(Mask.size() / 2, 0);
// Examine each pair (M0, M1) of adjacent narrow elements; they widen only if
// they are both undef, both zeroable, or form an aligned adjacent pair.
4931 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4933 int M1 = Mask[i + 1];
4935 // If both elements are undef, its trivial.
4936 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4937 WidenedMask[i / 2] = SM_SentinelUndef;
4941 // Check for an undef mask and a mask value properly aligned to fit with
4942 // a pair of values. If we find such a case, use the non-undef mask's value.
4943 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4944 WidenedMask[i / 2] = M1 / 2;
4947 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4948 WidenedMask[i / 2] = M0 / 2;
4952 // When zeroing, we need to spread the zeroing across both lanes to widen.
4953 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4954 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4955 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4956 WidenedMask[i / 2] = SM_SentinelZero;
4962 // Finally check if the two mask values are adjacent and aligned with
// a wide element boundary: M0 even and M1 == M0 + 1.
4964 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4965 WidenedMask[i / 2] = M0 / 2;
4969 // Otherwise we can't safely widen the elements used in this shuffle.
4972 assert(WidenedMask.size() == Mask.size() / 2 &&
4973 "Incorrect size of mask after widening the elements!");
/// Overload that first folds the Zeroable bitmask into the mask (zeroable
/// lanes become SM_SentinelZero) before attempting to widen.
4978 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4979                                     const APInt &Zeroable,
4980                                     SmallVectorImpl<int> &WidenedMask) {
4981 SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
4982 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
4983 if (TargetMask[i] == SM_SentinelUndef)
4986 TargetMask[i] = SM_SentinelZero;
4988 return canWidenShuffleElements(TargetMask, WidenedMask);
/// Query-only overload: reports widenability, discarding the widened mask.
4991 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
4992 SmallVector<int, 32> WidenedMask;
4993 return canWidenShuffleElements(Mask, WidenedMask);
4996 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4997 bool X86::isZeroNode(SDValue Elt) {
4998 return isNullConstant(Elt) || isNullFPConstant(Elt);
5001 // Build a vector of constants.
5002 // Use an UNDEF node if MaskElt == -1.
5003 // Split 64-bit constants in the 32-bit mode.
5004 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
5005                               const SDLoc &dl, bool IsMask = false) {
5007 SmallVector<SDValue, 32> Ops;
// When i64 is not legal (32-bit mode), build as 2x-wide v<2N x i32> and
// bitcast back to VT at the end.
5010 MVT ConstVecVT = VT;
5011 unsigned NumElts = VT.getVectorNumElements();
5012 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5013 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5014 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5018 MVT EltVT = ConstVecVT.getVectorElementType();
5019 for (unsigned i = 0; i < NumElts; ++i) {
// In mask mode a negative value denotes an undef lane.
5020 bool IsUndef = Values[i] < 0 && IsMask;
5021 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5022 DAG.getConstant(Values[i], dl, EltVT);
5023 Ops.push_back(OpNode);
// Split path (elided guard): emit the second 32-bit half of each element.
5025 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5026 DAG.getConstant(0, dl, EltVT));
5028 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5030 ConstsNode = DAG.getBitcast(VT, ConstsNode);
/// Build a constant vector from raw APInt bit patterns; Undefs marks undef
/// lanes. i64 elements are split into i32 pairs when i64 is not legal.
5034 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5035                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5036 assert(Bits.size() == Undefs.getBitWidth() &&
5037 "Unequal constant and undef arrays");
5038 SmallVector<SDValue, 32> Ops;
5041 MVT ConstVecVT = VT;
5042 unsigned NumElts = VT.getVectorNumElements();
5043 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5044 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5045 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5049 MVT EltVT = ConstVecVT.getVectorElementType();
5050 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
// Undef lanes produce one UNDEF op, or two when splitting i64 into i32s.
5052 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5055 const APInt &V = Bits[i];
5056 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
// Split path: low 32 bits then high 32 bits.
5058 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5059 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5060 } else if (EltVT == MVT::f32) {
5061 APFloat FV(APFloat::IEEEsingle(), V);
5062 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5063 } else if (EltVT == MVT::f64) {
5064 APFloat FV(APFloat::IEEEdouble(), V);
5065 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5067 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5071 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5072 return DAG.getBitcast(VT, ConstsNode);
5075 /// Returns a vector of specified type with all zero elements.
5076 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5077                              SelectionDAG &DAG, const SDLoc &dl) {
5078 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5079 VT.getVectorElementType() == MVT::i1) &&
5080 "Unexpected vector type");
5082 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5083 // type. This ensures they get CSE'd. But if the integer type is not
5084 // available, use a floating-point +0.0 instead.
5086 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5087 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5088 } else if (VT.getVectorElementType() == MVT::i1) {
// Mask (k-register) vectors are zeroed directly in their own type.
5089 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5090 "Unexpected vector type");
5091 Vec = DAG.getConstant(0, dl, VT);
5093 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5094 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5096 return DAG.getBitcast(VT, Vec);
5099 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5100 const SDLoc &dl, unsigned vectorWidth) {
5101 EVT VT = Vec.getValueType();
5102 EVT ElVT = VT.getVectorElementType();
5103 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5104 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5105 VT.getVectorNumElements()/Factor);
5107 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5108 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5109 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5111 // This is the index of the first element of the vectorWidth-bit chunk
5112 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5113 IdxVal &= ~(ElemsPerChunk - 1);
5115 // If the input is a buildvector just emit a smaller one.
5116 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5117 return DAG.getBuildVector(ResultVT, dl,
5118 Vec->ops().slice(IdxVal, ElemsPerChunk));
5120 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5121 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5124 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5125 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5126 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5127 /// instructions or a simple subregister reference. Idx is an index in the
5128 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5129 /// lowering EXTRACT_VECTOR_ELT operations easier.
5130 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5131 SelectionDAG &DAG, const SDLoc &dl) {
5132 assert((Vec.getValueType().is256BitVector() ||
5133 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5134 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5137 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5138 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5139 SelectionDAG &DAG, const SDLoc &dl) {
5140 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5141 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
/// Insert \p Vec (a vectorWidth-bit vector) into \p Result at the chunk
/// containing element \p IdxVal, emitting an INSERT_SUBVECTOR node.
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  // NOTE(review): the undef early-out this comment describes (return Result
  // when Vec is undef) appears to be elided from this copy — verify against
  // upstream before relying on undef inputs here.
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5168 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5169 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5170 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5171 /// simple superregister reference. Idx is an index in the 128 bits
5172 /// we want. It need not be aligned to a 128-bit boundary. That makes
5173 /// lowering INSERT_VECTOR_ELT operations easier.
5174 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5175 SelectionDAG &DAG, const SDLoc &dl) {
5176 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5177 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5180 /// Widen a vector to a larger size with the same scalar type, with the new
5181 /// elements either zero or undef.
5182 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5183 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5185 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5186 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5187 "Unsupported vector widening type");
5188 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5190 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5191 DAG.getIntPtrConstant(0, dl));
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
template <typename F>
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
                         F Builder, bool CheckBWI = true) {
  assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
  // Widest usable register width decides the chunk count: 512-bit with
  // AVX512(BW), 256-bit with AVX2, otherwise 128-bit.
  // NOTE(review): several closing braces and an "} else {" in this cascade
  // appear to be elided from this copy.
  unsigned NumSubs = 1;
  if ((CheckBWI && Subtarget.useBWIRegs()) ||
      (!CheckBWI && Subtarget.useAVX512Regs())) {
    if (VT.getSizeInBits() > 512) {
      NumSubs = VT.getSizeInBits() / 512;
      assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256) {
      NumSubs = VT.getSizeInBits() / 256;
      assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
    if (VT.getSizeInBits() > 128) {
      NumSubs = VT.getSizeInBits() / 128;
      assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");

  // No split needed: apply the builder to the whole operand list.
  // NOTE(review): the "if (NumSubs == 1)" guard before this return appears to
  // be elided from this copy.
  return Builder(DAG, DL, Ops);

  // Split every operand into NumSubs equally-sized subvectors, apply the
  // builder per chunk, and concatenate the per-chunk results back to VT.
  SmallVector<SDValue, 4> Subs;
  for (unsigned i = 0; i != NumSubs; ++i) {
    SmallVector<SDValue, 2> SubOps;
    for (SDValue Op : Ops) {
      EVT OpVT = Op.getValueType();
      unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
      unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
      SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
    Subs.push_back(Builder(DAG, DL, SubOps));
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
// Return true if the instruction zeroes the unused upper part of the
// destination and accepts mask.
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
  // NOTE(review): the switch header, the remaining case labels, and the
  // default/return statements appear to be elided from this copy; only the
  // CMPM_RND case label survives.
  case X86ISD::CMPM_RND:
/// Insert i1-subvector to i1-vector.
///
/// Lowers INSERT_SUBVECTOR of vXi1 (AVX-512 mask) values: widen both vectors
/// to a width whose KSHIFT is natively supported (v8i1 with DQI, else v16i1),
/// assemble the result with KSHIFTL/KSHIFTR plus OR/XOR, and extract back to
/// the original type.
/// NOTE(review): several single-statement bodies (early returns), closing
/// braces, and the "SDLoc dl(Op);" declaration appear to be elided from this
/// copy; comments below mark the obvious spots.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  // Only constant insertion indices are handled here (elided early-out).
  if (!isa<ConstantSDNode>(Idx))

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to natively supported kshift.
  MVT WideOpVT = OpVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     getZeroVector(WideOpVT, Subtarget, DAG, dl),
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  // Zero lower bits of the Vec
  // (shift right then left by SubVecNumElems clears the low lanes).
  SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
  Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
  // Merge them together, SubVec should be zero extended.
  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       getZeroVector(WideOpVT, Subtarget, DAG, dl),
  Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

  // Widen SubVec to the kshift-legal width (upper lanes undef).
  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       Undef, SubVec, ZeroIdx);

  // Inserting into an undef vector: just shift SubVec into position.
  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);

  // Inserting into a zero vector: shift left to clear the lanes above the
  // subvector, then shift right into final position (upper lanes become 0).
  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(ShiftLeft, dl, MVT::i8));
    if (ShiftRight != 0)
      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                           DAG.getConstant(ShiftRight, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        getZeroVector(WideOpVT, Subtarget, DAG, dl),
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
  // Move the current value of the bit to be replaced to the lsbs.
  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                   DAG.getConstant(IdxVal, dl, MVT::i8));
  // Xor with the new bit.
  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
  // Shift to MSB, filling bottom bits with 0.
  unsigned ShiftLeft = NumElems - SubVecNumElems;
  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
                   DAG.getConstant(ShiftLeft, dl, MVT::i8));
  // Shift to the final position, filling upper bits with 0.
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
                   DAG.getConstant(ShiftRight, dl, MVT::i8));
  // Xor with original vector leaving the new value.
  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5391 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5392 unsigned NumElems, SelectionDAG &DAG,
5393 const SDLoc &dl, unsigned VectorWidth) {
5394 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5395 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5398 /// Returns a vector of specified type with all bits set.
5399 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5400 /// Then bitcast to their original type, ensuring they get CSE'd.
5401 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5402 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5403 "Expected a 128/256/512-bit vector type");
5405 APInt Ones = APInt::getAllOnesValue(32);
5406 unsigned NumElts = VT.getSizeInBits() / 32;
5407 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5408 return DAG.getBitcast(VT, Vec);
5411 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5412 SelectionDAG &DAG) {
5413 EVT InVT = In.getValueType();
5414 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5416 if (VT.is128BitVector() && InVT.is128BitVector())
5417 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5418 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5420 // For 256-bit vectors, we only need the lower (128-bit) input half.
5421 // For 512-bit vectors, we only need the lower input half or quarter.
5422 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5423 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5424 In = extractSubVector(In, 0, DAG, DL,
5425 std::max(128, (int)VT.getSizeInBits() / Scale));
5428 return DAG.getNode(Opc, DL, VT, In);
5431 /// Returns a vector_shuffle node for an unpackl operation.
5432 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5433 SDValue V1, SDValue V2) {
5434 SmallVector<int, 8> Mask;
5435 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5436 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5439 /// Returns a vector_shuffle node for an unpackh operation.
5440 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5441 SDValue V1, SDValue V2) {
5442 SmallVector<int, 8> Mask;
5443 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5444 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5447 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5448 /// This produces a shuffle where the low element of V2 is swizzled into the
5449 /// zero/undef vector, landing at element Idx.
5450 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5451 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5453 const X86Subtarget &Subtarget,
5454 SelectionDAG &DAG) {
5455 MVT VT = V2.getSimpleValueType();
5457 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5458 int NumElems = VT.getVectorNumElements();
5459 SmallVector<int, 16> MaskVec(NumElems);
5460 for (int i = 0; i != NumElems; ++i)
5461 // If this is the insertion idx, put the low elt of V2 here.
5462 MaskVec[i] = (i == Idx) ? NumElems : i;
5463 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5466 static SDValue peekThroughBitcasts(SDValue V) {
5467 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5468 V = V.getOperand(0);
5472 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5473 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5474 V.getOperand(0).hasOneUse())
5475 V = V.getOperand(0);
5479 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5480 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5481 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5482 V = V.getOperand(0);
5486 static const Constant *getTargetConstantFromNode(SDValue Op) {
5487 Op = peekThroughBitcasts(Op);
5489 auto *Load = dyn_cast<LoadSDNode>(Op);
5493 SDValue Ptr = Load->getBasePtr();
5494 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5495 Ptr->getOpcode() == X86ISD::WrapperRIP)
5496 Ptr = Ptr->getOperand(0);
5498 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5499 if (!CNode || CNode->isMachineConstantPoolEntry())
5502 return dyn_cast<Constant>(CNode->getConstVal());
// Extract raw constant bits from constant pools.
//
// Attempts to view the constant data behind \p Op as NumElts elements of
// \p EltSizeInBits bits each (NumElts = bit-size of Op / EltSizeInBits),
// reporting per-element undef state in UndefElts and values in EltBits.
// NOTE(review): the "APInt &UndefElts" output-parameter line of the
// signature, several "return false/true" statements, and a number of closing
// braces appear to be elided from this copy; "UndefElts" below refers to that
// elided parameter.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  // Bitcast a source array of element bits to the target size.
  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
           "Constant bit sizes don't match");

    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefSrcElts.getBoolValue() && !AllowUndefs)

    // If we're already the right size, don't bother bitcasting.
    if (NumSrcElts == NumElts) {
      UndefElts = UndefSrcElts;
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());

    // Extract all the undef/constant element data and pack into single bitsets.
    APInt UndefBits(SizeInBits, 0);
    APInt MaskBits(SizeInBits, 0);

    for (unsigned i = 0; i != NumSrcElts; ++i) {
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (UndefSrcElts[i])
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
      MaskBits.insertBits(SrcEltBits[i], BitOffset);

    // Split the undef/constant single bitset data into the target elements.
    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
        UndefElts.setBit(i);

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // allowed by AllowPartialUndefs).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();

  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned UndefBitIndex) {
    if (isa<UndefValue>(Cst)) {
      Undefs.setBit(UndefBitIndex);
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask = CInt->getValue();
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask = CFP->getValueAPF().bitcastToAPInt();

  // Whole-vector-undef source: all elements undef, all bits zero.
  // NOTE(review): the enclosing "if (Op.isUndef())" appears elided here.
  APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
  SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
  return CastBitData(UndefSrcElts, SrcEltBits);

  // Extract scalar constant bits.
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
    APInt UndefSrcElts = APInt::getNullValue(1);
    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
    return CastBitData(UndefSrcElts, SrcEltBits);
  if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
    APInt UndefSrcElts = APInt::getNullValue(1);
    APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
    SmallVector<APInt, 64> SrcEltBits(1, RawBits);
    return CastBitData(UndefSrcElts, SrcEltBits);

  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      if (Src.isUndef()) {
        UndefSrcElts.setBit(i);
      auto *Cst = cast<ConstantSDNode>(Src);
      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    return CastBitData(UndefSrcElts, SrcEltBits);

  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))

    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
    unsigned NumSrcElts = CstTy->getVectorNumElements();

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0; i != NumSrcElts; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],

    return CastBitData(UndefSrcElts, SrcEltBits);

  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= VT.getScalarSizeInBits()) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

      APInt UndefSrcElts(NumSrcElts, 0);
      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
        // Replicate the single scalar's bits/undef state across all elements.
        if (UndefSrcElts[0])
          UndefSrcElts.setBits(0, NumSrcElts);
        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
        return CastBitData(UndefSrcElts, SrcEltBits);

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    // Element 0 holds the constant; VZEXT_MOVL zeroes the remaining elements.
    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits;
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
5691 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5692 unsigned MaskEltSizeInBits,
5693 SmallVectorImpl<uint64_t> &RawMask) {
5695 SmallVector<APInt, 64> EltBits;
5697 // Extract the raw target constant bits.
5698 // FIXME: We currently don't support UNDEF bits or mask entries.
5699 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5700 EltBits, /* AllowWholeUndefs */ false,
5701 /* AllowPartialUndefs */ false))
5704 // Insert the extracted elements into the mask.
5705 for (APInt Elt : EltBits)
5706 RawMask.push_back(Elt.getZExtValue());
5711 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5712 /// Note: This ignores saturation, so inputs must be checked first.
5713 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5715 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5716 unsigned NumElts = VT.getVectorNumElements();
5717 unsigned NumLanes = VT.getSizeInBits() / 128;
5718 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5719 unsigned Offset = Unary ? 0 : NumElts;
5721 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5722 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5723 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5724 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5725 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5729 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5730 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5731 /// operands in \p Ops, and returns true.
5732 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5733 /// IsUnary for shuffles which use a single input multiple times, and in those
5734 /// cases it will adjust the mask to only have indices within that single input.
5735 /// It is an error to call this with non-empty Mask/Ops vectors.
5736 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5737 SmallVectorImpl<SDValue> &Ops,
5738 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5739 unsigned NumElems = VT.getVectorNumElements();
5742 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5743 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5746 bool IsFakeUnary = false;
5747 switch(N->getOpcode()) {
5748 case X86ISD::BLENDI:
5749 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5750 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5751 ImmN = N->getOperand(N->getNumOperands()-1);
5752 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5753 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5756 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5757 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5758 ImmN = N->getOperand(N->getNumOperands()-1);
5759 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5760 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5761 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5763 case X86ISD::INSERTPS:
5764 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5765 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5766 ImmN = N->getOperand(N->getNumOperands()-1);
5767 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5768 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5770 case X86ISD::EXTRQI:
5771 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5772 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5773 isa<ConstantSDNode>(N->getOperand(2))) {
5774 int BitLen = N->getConstantOperandVal(1);
5775 int BitIdx = N->getConstantOperandVal(2);
5776 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5781 case X86ISD::INSERTQI:
5782 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5783 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5784 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5785 isa<ConstantSDNode>(N->getOperand(3))) {
5786 int BitLen = N->getConstantOperandVal(2);
5787 int BitIdx = N->getConstantOperandVal(3);
5788 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5790 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5793 case X86ISD::UNPCKH:
5794 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5795 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5796 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5797 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5799 case X86ISD::UNPCKL:
5800 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5801 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5802 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5803 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5805 case X86ISD::MOVHLPS:
5806 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5807 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5808 DecodeMOVHLPSMask(NumElems, Mask);
5809 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5811 case X86ISD::MOVLHPS:
5812 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5813 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5814 DecodeMOVLHPSMask(NumElems, Mask);
5815 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5817 case X86ISD::PALIGNR:
5818 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5819 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5820 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5821 ImmN = N->getOperand(N->getNumOperands()-1);
5822 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5824 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5825 Ops.push_back(N->getOperand(1));
5826 Ops.push_back(N->getOperand(0));
5828 case X86ISD::VSHLDQ:
5829 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5830 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5831 ImmN = N->getOperand(N->getNumOperands() - 1);
5832 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5836 case X86ISD::VSRLDQ:
5837 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5839 ImmN = N->getOperand(N->getNumOperands() - 1);
5840 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5844 case X86ISD::PSHUFD:
5845 case X86ISD::VPERMILPI:
5846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5847 ImmN = N->getOperand(N->getNumOperands()-1);
5848 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5849 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5852 case X86ISD::PSHUFHW:
5853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5854 ImmN = N->getOperand(N->getNumOperands()-1);
5855 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5859 case X86ISD::PSHUFLW:
5860 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5861 ImmN = N->getOperand(N->getNumOperands()-1);
5862 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5866 case X86ISD::VZEXT_MOVL:
5867 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5868 DecodeZeroMoveLowMask(NumElems, Mask);
5871 case X86ISD::VBROADCAST: {
5872 SDValue N0 = N->getOperand(0);
5873 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5874 // add the pre-extracted value to the Ops vector.
5875 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5876 N0.getOperand(0).getValueType() == VT &&
5877 N0.getConstantOperandVal(1) == 0)
5878 Ops.push_back(N0.getOperand(0));
5880 // We only decode broadcasts of same-sized vectors, unless the broadcast
5881 // came from an extract from the original width. If we found one, we
5882 // pushed it the Ops vector above.
5883 if (N0.getValueType() == VT || !Ops.empty()) {
5884 DecodeVectorBroadcast(NumElems, Mask);
5890 case X86ISD::VPERMILPV: {
5891 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5893 SDValue MaskNode = N->getOperand(1);
5894 unsigned MaskEltSize = VT.getScalarSizeInBits();
5895 SmallVector<uint64_t, 32> RawMask;
5896 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5897 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5900 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5901 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5906 case X86ISD::PSHUFB: {
5907 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5908 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5909 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5911 SDValue MaskNode = N->getOperand(1);
5912 SmallVector<uint64_t, 32> RawMask;
5913 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5914 DecodePSHUFBMask(RawMask, Mask);
5917 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5918 DecodePSHUFBMask(C, Mask);
5923 case X86ISD::VPERMI:
5924 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5925 ImmN = N->getOperand(N->getNumOperands()-1);
5926 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5931 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5932 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5933 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5935 case X86ISD::VPERM2X128:
5936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5937 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5938 ImmN = N->getOperand(N->getNumOperands()-1);
5939 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5941 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5943 case X86ISD::SHUF128:
5944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5946 ImmN = N->getOperand(N->getNumOperands()-1);
5947 decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
5948 cast<ConstantSDNode>(ImmN)->getZExtValue(),
5950 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5952 case X86ISD::MOVSLDUP:
5953 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5954 DecodeMOVSLDUPMask(NumElems, Mask);
5957 case X86ISD::MOVSHDUP:
5958 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5959 DecodeMOVSHDUPMask(NumElems, Mask);
5962 case X86ISD::MOVDDUP:
5963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5964 DecodeMOVDDUPMask(NumElems, Mask);
5967 case X86ISD::VPERMIL2: {
5968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5969 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5970 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5971 unsigned MaskEltSize = VT.getScalarSizeInBits();
5972 SDValue MaskNode = N->getOperand(2);
5973 SDValue CtrlNode = N->getOperand(3);
5974 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5975 unsigned CtrlImm = CtrlOp->getZExtValue();
5976 SmallVector<uint64_t, 32> RawMask;
5977 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5978 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5982 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5983 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5989 case X86ISD::VPPERM: {
5990 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5991 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5992 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5993 SDValue MaskNode = N->getOperand(2);
5994 SmallVector<uint64_t, 32> RawMask;
5995 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5996 DecodeVPPERMMask(RawMask, Mask);
5999 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6000 DecodeVPPERMMask(C, Mask);
6005 case X86ISD::VPERMV: {
6006 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
6008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
6009 Ops.push_back(N->getOperand(1));
6010 SDValue MaskNode = N->getOperand(0);
6011 SmallVector<uint64_t, 32> RawMask;
6012 unsigned MaskEltSize = VT.getScalarSizeInBits();
6013 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
6014 DecodeVPERMVMask(RawMask, Mask);
6017 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6018 DecodeVPERMVMask(C, MaskEltSize, Mask);
6023 case X86ISD::VPERMV3: {
6024 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6025 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6026 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6027 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6028 Ops.push_back(N->getOperand(0));
6029 Ops.push_back(N->getOperand(2));
6030 SDValue MaskNode = N->getOperand(1);
6031 unsigned MaskEltSize = VT.getScalarSizeInBits();
6032 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6033 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
6038 default: llvm_unreachable("unknown target shuffle node");
6041 // Empty mask indicates the decode failed.
6045 // Check if we're getting a shuffle mask with zero'd elements.
6046 if (!AllowSentinelZero)
6047 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6050 // If we have a fake unary shuffle, the shuffle mask is spread across two
6051 // inputs that are actually the same node. Re-map the mask to always point
6052 // into the first input.
6055 if (M >= (int)Mask.size())
6058 // If we didn't already add operands in the opcode-specific code, default to
6059 // adding 1 or 2 operands starting at 0.
6061 Ops.push_back(N->getOperand(0));
6062 if (!IsUnary || IsFakeUnary)
6063 Ops.push_back(N->getOperand(1));
6069 /// Check a target shuffle mask's inputs to see if we can set any values to
6070 /// SM_SentinelZero - this is for elements that are known to be zero
6071 /// (not just zeroable) from their inputs.
6072 /// Returns true if the target shuffle mask was decoded.
6073 static bool setTargetShuffleZeroElements(SDValue N,
6074 SmallVectorImpl<int> &Mask,
6075 SmallVectorImpl<SDValue> &Ops) {
// Only target-specific shuffle opcodes can be decoded by getTargetShuffleMask.
6077 if (!isTargetShuffle(N.getOpcode()))
6080 MVT VT = N.getSimpleValueType();
6081 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
// For a unary shuffle both halves of the mask index the same input vector.
6084 SDValue V1 = Ops[0];
6085 SDValue V2 = IsUnary ? V1 : Ops[1];
// Look through bitcasts so the constant/UNDEF detection below sees the
// underlying source nodes.
6087 V1 = peekThroughBitcasts(V1);
6088 V2 = peekThroughBitcasts(V2);
6090 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6091 "Illegal split of shuffle value type");
// The decoded mask may use a different element count than VT, so derive the
// per-mask-element bit width from the total vector size.
6092 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6094 // Extract known constant input data.
6095 APInt UndefSrcElts[2];
6096 SmallVector<APInt, 32> SrcEltBits[2];
6097 bool IsSrcConstant[2] = {
6098 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6099 SrcEltBits[0], true, false),
6100 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6101 SrcEltBits[1], true, false)};
6103 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6106 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6110 // Determine shuffle input and normalize the mask.
6111 unsigned SrcIdx = M / Size;
6112 SDValue V = M < Size ? V1 : V2;
6115 // We are referencing an UNDEF input.
6117 Mask[i] = SM_SentinelUndef;
6121 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6122 // TODO: We currently only set UNDEF for integer types - floats use the same
6123 // registers as vectors and many of the scalar folded loads rely on the
6124 // SCALAR_TO_VECTOR pattern.
6125 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6126 (Size % V.getValueType().getVectorNumElements()) == 0) {
// Scale maps mask indices onto the (possibly narrower) input vector.
6127 int Scale = Size / V.getValueType().getVectorNumElements();
6128 int Idx = M / Scale;
6129 if (Idx != 0 && !VT.isFloatingPoint())
6130 Mask[i] = SM_SentinelUndef;
6131 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6132 Mask[i] = SM_SentinelZero;
6136 // Attempt to extract from the source's constant bits.
6137 if (IsSrcConstant[SrcIdx]) {
// Known-undef constant elements become UNDEF, known-zero elements ZERO.
6138 if (UndefSrcElts[SrcIdx][M])
6139 Mask[i] = SM_SentinelUndef;
6140 else if (SrcEltBits[SrcIdx][M] == 0)
6141 Mask[i] = SM_SentinelZero;
6145 assert(VT.getVectorNumElements() == Mask.size() &&
6146 "Different mask size from vector size!");
6150 // Attempt to decode ops that could be represented as a shuffle mask.
6151 // The decoded shuffle mask may contain a different number of elements to the
6152 // destination value type.
6153 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6154 SmallVectorImpl<SDValue> &Ops,
6155 const SelectionDAG &DAG) {
6159 MVT VT = N.getSimpleValueType();
6160 unsigned NumElts = VT.getVectorNumElements();
6161 unsigned NumSizeInBits = VT.getSizeInBits();
6162 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6163 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6164 "Expected byte aligned value types");
6166 unsigned Opcode = N.getOpcode();
6168 case ISD::VECTOR_SHUFFLE: {
6169 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6170 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6171 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6172 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6173 Ops.push_back(N.getOperand(0));
6174 Ops.push_back(N.getOperand(1));
6180 case X86ISD::ANDNP: {
6181 // Attempt to decode as a per-byte mask.
6183 SmallVector<APInt, 32> EltBits;
6184 SDValue N0 = N.getOperand(0);
6185 SDValue N1 = N.getOperand(1);
// ANDNP inverts its first operand, so the constant mask operand and the
// byte value that selects "zero" are swapped relative to plain AND.
6186 bool IsAndN = (X86ISD::ANDNP == Opcode);
6187 uint64_t ZeroMask = IsAndN ? 255 : 0;
6188 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6190 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6192 Mask.push_back(SM_SentinelUndef);
// Only all-zeros or all-ones bytes can be modelled as a shuffle; any
// partial byte mask cannot.
6195 uint64_t ByteBits = EltBits[i].getZExtValue();
6196 if (ByteBits != 0 && ByteBits != 255)
6198 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6200 Ops.push_back(IsAndN ? N1 : N0);
6203 case ISD::SCALAR_TO_VECTOR: {
6204 // Match against a scalar_to_vector of an extract from a vector,
6205 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6206 SDValue N0 = N.getOperand(0);
6209 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6210 N0.getOperand(0).getValueType() == VT) ||
6211 (N0.getOpcode() == X86ISD::PEXTRW &&
6212 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6213 (N0.getOpcode() == X86ISD::PEXTRB &&
6214 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6218 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6221 SDValue SrcVec = SrcExtract.getOperand(0);
6222 EVT SrcVT = SrcVec.getValueType();
6223 unsigned NumSrcElts = SrcVT.getVectorNumElements();
// PEXTRW/PEXTRB zero-extend the scalar, so the widened element is the
// extracted value followed by known-zero sub-elements.
6224 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6226 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6227 if (NumSrcElts <= SrcIdx)
6230 Ops.push_back(SrcVec);
6231 Mask.push_back(SrcIdx);
6232 Mask.append(NumZeros, SM_SentinelZero);
6233 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6236 case X86ISD::PINSRB:
6237 case X86ISD::PINSRW: {
6238 SDValue InVec = N.getOperand(0);
6239 SDValue InScl = N.getOperand(1);
6240 SDValue InIndex = N.getOperand(2);
// Bail on non-constant or out-of-range insertion indices.
6241 if (!isa<ConstantSDNode>(InIndex) ||
6242 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6244 uint64_t InIdx = N.getConstantOperandVal(2);
6246 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6247 if (X86::isZeroNode(InScl)) {
6248 Ops.push_back(InVec);
6249 for (unsigned i = 0; i != NumElts; ++i)
6250 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6254 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6255 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6257 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6258 if (InScl.getOpcode() != ExOp)
6261 SDValue ExVec = InScl.getOperand(0);
6262 SDValue ExIndex = InScl.getOperand(1);
6263 if (!isa<ConstantSDNode>(ExIndex) ||
6264 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6266 uint64_t ExIdx = InScl.getConstantOperandVal(1);
// Two-input shuffle: element InIdx comes from ExVec, the rest from InVec.
6268 Ops.push_back(InVec);
6269 Ops.push_back(ExVec);
6270 for (unsigned i = 0; i != NumElts; ++i)
6271 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6274 case X86ISD::PACKSS:
6275 case X86ISD::PACKUS: {
6276 SDValue N0 = N.getOperand(0);
6277 SDValue N1 = N.getOperand(1);
6278 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6279 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6280 "Unexpected input value type");
6282 // If we know input saturation won't happen we can treat this
6283 // as a truncation shuffle.
6284 if (Opcode == X86ISD::PACKSS) {
// PACKSS: enough sign bits means the narrow result equals a truncate.
6285 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6286 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
// PACKUS: upper half of each source element must be known zero.
6289 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6290 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6291 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6295 bool IsUnary = (N0 == N1);
6301 createPackShuffleMask(VT, Mask, IsUnary);
6305 case X86ISD::VSRLI: {
6306 uint64_t ShiftVal = N.getConstantOperandVal(1);
6307 // Out of range bit shifts are guaranteed to be zero.
6308 if (NumBitsPerElt <= ShiftVal) {
6309 Mask.append(NumElts, SM_SentinelZero);
6313 // We can only decode 'whole byte' bit shifts as shuffles.
6314 if ((ShiftVal % 8) != 0)
6317 uint64_t ByteShift = ShiftVal / 8;
6318 unsigned NumBytes = NumSizeInBits / 8;
6319 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6320 Ops.push_back(N.getOperand(0));
6322 // Clear mask to all zeros and insert the shifted byte indices.
6323 Mask.append(NumBytes, SM_SentinelZero);
// Shifts operate per element, so bytes never cross element boundaries.
6325 if (X86ISD::VSHLI == Opcode) {
6326 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6327 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6328 Mask[i + j] = i + j - ByteShift;
6330 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6331 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6332 Mask[i + j - ByteShift] = i + j;
6336 case ISD::ZERO_EXTEND_VECTOR_INREG:
6337 case X86ISD::VZEXT: {
6338 // TODO - add support for VPMOVZX with smaller input vector types.
6339 SDValue Src = N.getOperand(0);
6340 MVT SrcVT = Src.getSimpleValueType();
6341 if (NumSizeInBits != SrcVT.getSizeInBits())
6343 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6344 VT.getVectorNumElements(), Mask);
6353 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6354 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6355 SmallVectorImpl<int> &Mask) {
6356 int MaskWidth = Mask.size();
6357 SmallVector<SDValue, 16> UsedInputs;
6358 for (int i = 0, e = Inputs.size(); i < e; ++i) {
// Each retained input owns the index window [lo, hi) of the concatenated
// mask space; lo is based on how many inputs we have kept so far.
6359 int lo = UsedInputs.size() * MaskWidth;
6360 int hi = lo + MaskWidth;
6362 // Strip UNDEF input usage.
6363 if (Inputs[i].isUndef())
6365 if ((lo <= M) && (M < hi))
6366 M = SM_SentinelUndef;
6368 // Check for unused inputs.
6369 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6370 UsedInputs.push_back(Inputs[i]);
// Replace the input list with only the inputs the mask still references.
6377 Inputs = UsedInputs;
6380 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6381 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6382 /// remaining input indices in case we now have a unary shuffle and adjust the
6383 /// inputs accordingly.
6384 /// Returns true if the target shuffle mask was decoded.
6385 static bool resolveTargetShuffleInputs(SDValue Op,
6386 SmallVectorImpl<SDValue> &Inputs,
6387 SmallVectorImpl<int> &Mask,
6388 const SelectionDAG &DAG) {
// First try a real target shuffle decode; if that fails, fall back to
// recognising non-shuffle nodes that behave like shuffles ("faux" masks).
6389 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6390 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
// Drop unused/undef inputs and renumber the mask to match.
6393 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6397 /// Returns the scalar element that will make up the ith
6398 /// element of the result of the vector shuffle.
6399 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6402 return SDValue(); // Limit search depth.
6404 SDValue V = SDValue(N, 0);
6405 EVT VT = V.getValueType();
6406 unsigned Opcode = V.getOpcode();
6408 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6409 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6410 int Elt = SV->getMaskElt(Index);
6413 return DAG.getUNDEF(VT.getVectorElementType());
// Mask indices >= NumElems select from the second shuffle operand.
6415 unsigned NumElems = VT.getVectorNumElements();
6416 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6417 : SV->getOperand(1);
6418 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6421 // Recurse into target specific vector shuffles to find scalars.
6422 if (isTargetShuffle(Opcode)) {
6423 MVT ShufVT = V.getSimpleValueType();
6424 MVT ShufSVT = ShufVT.getVectorElementType();
6425 int NumElems = (int)ShufVT.getVectorNumElements();
6426 SmallVector<int, 16> ShuffleMask;
6427 SmallVector<SDValue, 16> ShuffleOps;
6430 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
// Sentinel mask values resolve directly to constant zero or UNDEF.
6433 int Elt = ShuffleMask[Index];
6434 if (Elt == SM_SentinelZero)
6435 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6436 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6437 if (Elt == SM_SentinelUndef)
6438 return DAG.getUNDEF(ShufSVT);
6440 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6441 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6442 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6446 // Actual nodes that may contain scalar elements
6447 if (Opcode == ISD::BITCAST) {
// Only look through bitcasts that preserve the element count, otherwise
// Index would no longer identify the same scalar.
6448 V = V.getOperand(0);
6449 EVT SrcVT = V.getValueType();
6450 unsigned NumElems = VT.getVectorNumElements();
6452 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
// SCALAR_TO_VECTOR defines only element 0; all others are UNDEF.
6456 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6457 return (Index == 0) ? V.getOperand(0)
6458 : DAG.getUNDEF(VT.getVectorElementType());
6460 if (V.getOpcode() == ISD::BUILD_VECTOR)
6461 return V.getOperand(Index);
6466 // Use PINSRB/PINSRW/PINSRD to create a build vector.
// NonZeros is a bitmask of which build_vector operands are non-zero;
// NumNonZero/NumZero are the corresponding population counts.
6467 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6468 unsigned NumNonZero, unsigned NumZero,
6470 const X86Subtarget &Subtarget) {
6471 MVT VT = Op.getSimpleValueType();
6472 unsigned NumElts = VT.getVectorNumElements();
// PINSRW needs SSE2; PINSRB/PINSRD need SSE4.1.
6473 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6474 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6475 "Illegal vector insertion");
6481 for (unsigned i = 0; i < NumElts; ++i) {
6482 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6486 // If the build vector contains zeros or our first insertion is not the
6487 // first index then insert into zero vector to break any register
6488 // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6491 if (NumZero || 0 != i)
6492 V = getZeroVector(VT, Subtarget, DAG, dl);
6494 assert(0 == i && "Expected insertion into zero-index");
// Start the chain with a zero-extended scalar move into element 0.
6495 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6496 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6497 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6498 V = DAG.getBitcast(VT, V);
// Subsequent non-zero elements are inserted one at a time.
6502 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6503 DAG.getIntPtrConstant(i, dl));
6509 /// Custom lower build_vector of v16i8.
6510 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6511 unsigned NumNonZero, unsigned NumZero,
6513 const X86Subtarget &Subtarget) {
// Too many inserts without PINSRB available isn't worth it.
6514 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6517 // SSE4.1 - use PINSRB to insert each byte directly.
6518 if (Subtarget.hasSSE41())
6519 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6526 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6527 for (unsigned i = 0; i < 16; ++i) {
6528 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
// On the first non-zero element choose the base vector: zeroed if any
// element must be zero, otherwise UNDEF.
6529 if (ThisIsNonZero && First) {
6531 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6533 V = DAG.getUNDEF(MVT::v8i16);
6538 // FIXME: Investigate extending to i32 instead of just i16.
6539 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6540 SDValue ThisElt, LastElt;
6541 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6542 if (LastIsNonZero) {
6544 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6546 if (ThisIsNonZero) {
// The odd byte occupies the high half of the i16 pair: (this << 8) | last.
6547 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6548 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6549 DAG.getConstant(8, dl, MVT::i8));
6551 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6557 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6558 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6559 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6560 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6561 V = DAG.getBitcast(MVT::v8i16, V);
// Insert the merged byte pair as word i/2.
6563 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6564 DAG.getIntPtrConstant(i / 2, dl));
6570 return DAG.getBitcast(MVT::v16i8, V);
6573 /// Custom lower build_vector of v8i16.
6574 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6575 unsigned NumNonZero, unsigned NumZero,
6577 const X86Subtarget &Subtarget) {
// Too many inserts isn't worth it without SSE4.1's cheaper insertions.
6578 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6581 // Use PINSRW to insert each element directly.
6582 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6586 /// Custom lower build_vector of v4i32 or v4f32.
// Attempts a blend-with-zero shuffle first, then an INSERTPS (SSE4.1).
6587 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6588 const X86Subtarget &Subtarget) {
6589 // Find all zeroable elements.
6590 std::bitset<4> Zeroable;
6591 for (int i=0; i < 4; ++i) {
6592 SDValue Elt = Op->getOperand(i);
6593 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6595 assert(Zeroable.size() - Zeroable.count() > 1 &&
6596 "We expect at least two non-zero elements!");
6598 // We only know how to deal with build_vector nodes where elements are either
6599 // zeroable or extract_vector_elt with constant index.
6600 SDValue FirstNonZero;
6601 unsigned FirstNonZeroIdx;
6602 for (unsigned i=0; i < 4; ++i) {
6605 SDValue Elt = Op->getOperand(i);
6606 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6607 !isa<ConstantSDNode>(Elt.getOperand(1)))
6609 // Make sure that this node is extracting from a 128-bit vector.
6610 MVT VT = Elt.getOperand(0).getSimpleValueType();
6611 if (!VT.is128BitVector())
6613 if (!FirstNonZero.getNode()) {
6615 FirstNonZeroIdx = i;
6619 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6620 SDValue V1 = FirstNonZero.getOperand(0);
6621 MVT VT = V1.getSimpleValueType();
6623 // See if this build_vector can be lowered as a blend with zero.
6625 unsigned EltMaskIdx, EltIdx;
6627 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6628 if (Zeroable[EltIdx]) {
6629 // The zero vector will be on the right hand side.
6630 Mask[EltIdx] = EltIdx+4;
6634 Elt = Op->getOperand(EltIdx);
6635 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6636 EltMaskIdx = Elt.getConstantOperandVal(1);
// A blend only works if every element comes from V1 at its own lane.
6637 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6639 Mask[EltIdx] = EltIdx;
6643 // Let the shuffle legalizer deal with blend operations.
6644 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6645 if (V1.getSimpleValueType() != VT)
6646 V1 = DAG.getBitcast(VT, V1);
6647 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6650 // See if we can lower this build_vector to a INSERTPS.
6651 if (!Subtarget.hasSSE41())
// EltIdx is the lane where the blend attempt failed; V2 is its source.
6654 SDValue V2 = Elt.getOperand(0);
6655 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6658 bool CanFold = true;
// All remaining elements must come from V1 at matching indices for a
// single INSERTPS to suffice.
6659 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6663 SDValue Current = Op->getOperand(i);
6664 SDValue SrcVector = Current->getOperand(0);
6667 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6673 assert(V1.getNode() && "Expected at least two non-zero elements!");
// INSERTPS operates on v4f32; bitcast integer inputs.
6674 if (V1.getSimpleValueType() != MVT::v4f32)
6675 V1 = DAG.getBitcast(MVT::v4f32, V1);
6676 if (V2.getSimpleValueType() != MVT::v4f32)
6677 V2 = DAG.getBitcast(MVT::v4f32, V2);
6679 // Ok, we can emit an INSERTPS instruction.
6680 unsigned ZMask = Zeroable.to_ulong();
// INSERTPS immediate: bits [7:6] = source lane, [5:4] = dest lane,
// [3:0] = zero mask.
6682 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6683 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6685 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6686 DAG.getIntPtrConstant(InsertPSMask, DL));
6687 return DAG.getBitcast(VT, Result);
6690 /// Return a vector logical shift node.
// Emits a whole-vector byte shift (PSLLDQ/PSRLDQ) of NumBits bits on SrcOp,
// bitcasting through v16i8 and back to VT.
6691 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6692 SelectionDAG &DAG, const TargetLowering &TLI,
6694 assert(VT.is128BitVector() && "Unknown type for VShift");
// VSHLDQ/VSRLDQ shift the full 128-bit register by whole bytes.
6695 MVT ShVT = MVT::v16i8;
6696 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6697 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6698 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6699 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
6700 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
// Try to lower a splat of a scalar load as a widened vector load plus a
// splat shuffle, absorbing any constant pointer offset into the mask.
6703 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6704 SelectionDAG &DAG) {
6706 // Check if the scalar load can be widened into a vector load. And if
6707 // the address is "base + cst" see if the cst can be "absorbed" into
6708 // the shuffle mask.
6709 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6710 SDValue Ptr = LD->getBasePtr();
// Only simple, non-volatile loads of 32-bit scalars are handled.
6711 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6713 EVT PVT = LD->getValueType(0);
6714 if (PVT != MVT::i32 && PVT != MVT::f32)
// The base must be a frame index (directly or as FI + constant offset)
// so we can control the stack object's alignment below.
6719 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6720 FI = FINode->getIndex();
6722 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6723 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6724 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6725 Offset = Ptr.getConstantOperandVal(1);
6726 Ptr = Ptr.getOperand(0);
6731 // FIXME: 256-bit vector instructions don't require a strict alignment,
6732 // improve this code to support it better.
6733 unsigned RequiredAlign = VT.getSizeInBits()/8;
6734 SDValue Chain = LD->getChain();
6735 // Make sure the stack object alignment is at least 16 or 32.
6736 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6737 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6738 if (MFI.isFixedObjectIndex(FI)) {
6739 // Can't change the alignment. FIXME: It's possible to compute
6740 // the exact stack offset and reference FI + adjust offset instead.
6741 // If someone *really* cares about this. That's the way to implement it.
6744 MFI.setObjectAlignment(FI, RequiredAlign);
6748 // (Offset % 16 or 32) must be multiple of 4. Then address is then
6749 // Ptr + (Offset & ~15).
6752 if ((Offset % RequiredAlign) & 3)
// Round the offset down to the aligned start of the containing vector.
6754 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6757 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6758 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
// The residual offset selects which element of the wide load to splat
// (>> 2 because elements are 4 bytes).
6761 int EltNo = (Offset - StartOffset) >> 2;
6762 unsigned NumElems = VT.getVectorNumElements();
6764 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6765 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6766 LD->getPointerInfo().getWithOffset(StartOffset));
6768 SmallVector<int, 8> Mask(NumElems, EltNo);
6770 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6776 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6777 /// elements can be replaced by a single large load which has the same value as
6778 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6780 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6781 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6782 const SDLoc &DL, SelectionDAG &DAG,
6783 const X86Subtarget &Subtarget,
6784 bool isAfterLegalize) {
6785 unsigned NumElems = Elts.size();
6787 int LastLoadedElt = -1;
// Classify every element as a load, a zero, or undef.
6788 SmallBitVector LoadMask(NumElems, false);
6789 SmallBitVector ZeroMask(NumElems, false);
6790 SmallBitVector UndefMask(NumElems, false);
6792 // For each element in the initializer, see if we've found a load, zero or an
6794 for (unsigned i = 0; i < NumElems; ++i) {
6795 SDValue Elt = peekThroughBitcasts(Elts[i]);
6800 UndefMask[i] = true;
6801 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6803 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6806 // Each loaded element must be the correct fractional portion of the
6807 // requested vector load.
6808 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6813 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6814 "Incomplete element masks");
6816 // Handle Special Cases - all undef or undef/zero.
6817 if (UndefMask.count() == NumElems)
6818 return DAG.getUNDEF(VT);
6820 // FIXME: Should we return this as a BUILD_VECTOR instead?
6821 if ((ZeroMask | UndefMask).count() == NumElems)
6822 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6823 : DAG.getConstantFP(0.0, DL, VT);
6825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6826 int FirstLoadedElt = LoadMask.find_first();
6827 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6828 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6829 EVT LDBaseVT = EltBase.getValueType();
6831 // Consecutive loads can contain UNDEFS but not ZERO elements.
6832 // Consecutive loads with UNDEFs and ZEROs elements require a
6833 // an additional shuffle stage to clear the ZERO elements.
6834 bool IsConsecutiveLoad = true;
6835 bool IsConsecutiveLoadWithZeros = true;
6836 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6838 SDValue Elt = peekThroughBitcasts(Elts[i]);
6839 LoadSDNode *LD = cast<LoadSDNode>(Elt);
// Each load must sit i - FirstLoadedElt elements past the base load.
6840 if (!DAG.areNonVolatileConsecutiveLoads(
6841 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6842 i - FirstLoadedElt)) {
6843 IsConsecutiveLoad = false;
6844 IsConsecutiveLoadWithZeros = false;
6847 } else if (ZeroMask[i]) {
6848 IsConsecutiveLoad = false;
// Collect the participating loads so the merged load can be chained
// after all of them.
6852 SmallVector<LoadSDNode *, 8> Loads;
6853 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6855 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6857 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6858 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6859 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6860 "Cannot merge volatile loads.");
6862 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6863 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
// Preserve memory ordering: anything that depended on the old loads
// now depends on the merged load.
6864 for (auto *LD : Loads)
6865 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6869 // LOAD - all consecutive load/undefs (must start/end with a load).
6870 // If we have found an entire vector of loads and undefs, then return a large
6871 // load of the entire vector width starting at the base pointer.
6872 // If the vector contains zeros, then attempt to shuffle those elements.
6873 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6874 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6875 assert(LDBase && "Did not find base load for merging consecutive loads");
6876 EVT EltVT = LDBase->getValueType(0);
6877 // Ensure that the input vector size for the merged loads matches the
6878 // cumulative size of the input elements.
6879 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6882 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6885 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6886 // will lower to regular temporal loads and use the cache.
6887 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6888 VT.is256BitVector() && !Subtarget.hasInt256())
6891 if (IsConsecutiveLoad)
6892 return CreateLoad(VT, LDBase);
6894 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6895 // vector and a zero vector to clear out the zero elements.
6896 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6897 SmallVector<int, 4> ClearMask(NumElems, -1);
6898 for (unsigned i = 0; i < NumElems; ++i) {
// Indices >= NumElems select from the zero vector (second operand).
6900 ClearMask[i] = i + NumElems;
6901 else if (LoadMask[i])
6904 SDValue V = CreateLoad(VT, LDBase);
6905 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6906 : DAG.getConstantFP(0.0, DL, VT);
6907 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
// Total bits covered by the contiguous run of loaded elements.
6912 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6914 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6915 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6916 (LoadSize == 32 || LoadSize == 64) &&
6917 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6918 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6919 : MVT::getIntegerVT(LoadSize);
6920 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6921 if (TLI.isTypeLegal(VecVT)) {
6922 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6923 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6925 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6926 LDBase->getPointerInfo(),
6927 LDBase->getAlignment(),
6928 MachineMemOperand::MOLoad);
6929 for (auto *LD : Loads)
6930 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6931 return DAG.getBitcast(VT, ResNode);
// Materialize a repeated constant bit pattern as an IR constant vector.
// SplatValue holds SplatBitSize bits that are split into NumElm pieces of
// VT's scalar size; each piece becomes one element of the returned vector.
// Used when a splat is wider than VT's scalar type and must be emitted as a
// constant-pool vector (e.g. for SUBV_BROADCAST).
6938 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6939 unsigned SplatBitSize, LLVMContext &C) {
6940 unsigned ScalarSize = VT.getScalarSizeInBits();
// Number of VT-scalar-sized elements covered by the repeated pattern.
6941 unsigned NumElm = SplatBitSize / ScalarSize;
6943 SmallVector<Constant *, 32> ConstantVec;
6944 for (unsigned i = 0; i < NumElm; i++) {
// Slice out the i-th ScalarSize-bit chunk of the splat pattern.
6945 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6947 if (VT.isFloatingPoint()) {
6948 if (ScalarSize == 32) {
// Reinterpret the raw bits as an IEEE single; no value conversion occurs.
6949 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6951 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6952 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
// Integer element types: emit an iN constant of the scalar width.
6955 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6956 ConstantVec.push_back(Const);
6958 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
// Returns true if any (transitive, through bitcasts) user of \p N is a
// target-specific shuffle node. Used to avoid rewriting a BUILD_VECTOR into
// a broadcast when shuffle lowering already consumes it.
6961 static bool isUseOfShuffle(SDNode *N) {
6962 for (auto *U : N->uses()) {
6963 if (isTargetShuffle(U->getOpcode()))
// Look through bitcasts: a shuffle of a bitcast of N still counts as a use.
6965 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6966 return isUseOfShuffle(U);
6971 // Check if the current node of a build vector is a zero-extended vector.
6972 // If so, return the value that is being extended.
6973 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6974 // NumElt - return the number of zero-extended identical values.
6975 // EltType - return the type of the value, including the zero extend.
// Detect a build_vector of the shape (a,0,...,0,a,0,...,0,...) — a repeated
// value padded with zeros/undefs at a power-of-two stride. On success the
// repeated value is returned, NumElt is set to the count of repeated values,
// and EltType to the integer type spanning one value plus its zero padding.
6976 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6977 unsigned &NumElt, MVT &EltType) {
// Candidate repeated value is operand 0; Delta becomes its repeat stride.
6978 SDValue ExtValue = Op->getOperand(0);
6979 unsigned NumElts = Op->getNumOperands();
6980 unsigned Delta = NumElts;
// First pass: find the distance to the next occurrence of ExtValue, and
// require everything before it to be zero or undef.
6982 for (unsigned i = 1; i < NumElts; i++) {
6983 if (Op->getOperand(i) == ExtValue) {
6987 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
// The stride must be a power of two > 1 so it maps onto an integer type.
6990 if (!isPowerOf2_32(Delta) || Delta == 1)
// Second pass: verify the pattern repeats exactly every Delta elements,
// with only zeros/undefs in between.
6993 for (unsigned i = Delta; i < NumElts; i++) {
6994 if (i % Delta == 0) {
6995 if (Op->getOperand(i) != ExtValue)
6997 } else if (!(isNullConstant(Op->getOperand(i)) ||
6998 Op->getOperand(i).isUndef()))
// One logical element = Delta scalar elements; report its integer type.
7001 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
7002 unsigned ExtVTSize = EltSize * Delta;
7003 EltType = MVT::getIntegerVT(ExtVTSize);
7004 NumElt = NumElts / Delta;
7008 /// Attempt to use the vbroadcast instruction to generate a splat value
7009 /// from a splat BUILD_VECTOR which uses:
7010 /// a. A single scalar load, or a constant.
7011 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7013 /// The VBROADCAST node is returned when a pattern is found,
7014 /// or SDValue() otherwise.
7015 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7016 const X86Subtarget &Subtarget,
7017 SelectionDAG &DAG) {
7018 // VBROADCAST requires AVX.
7019 // TODO: Splats could be generated for non-AVX CPUs using SSE
7020 // instructions, but there's less potential gain for only 128-bit vectors.
7021 if (!Subtarget.hasAVX())
7024 MVT VT = BVOp->getSimpleValueType(0);
7027 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7028 "Unsupported vector type for broadcast.");
// Ld is the splatted scalar (if any); UndefElements marks undef lanes.
7030 BitVector UndefElements;
7031 SDValue Ld = BVOp->getSplatValue(&UndefElements);
7033 // Attempt to use VBROADCASTM
7034 // From this pattern:
7035 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7036 // b. t1 = (build_vector t0 t0)
7038 // Create (VBROADCASTM v2i1 X)
// VBROADCASTM broadcasts a mask register; needs CDI, and VLX for sub-512-bit.
7039 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7040 MVT EltType = VT.getScalarType();
7041 unsigned NumElts = VT.getVectorNumElements();
7043 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
// Accept either an explicit zero-extended splat pattern or a splat of a
// (zext (bitcast mask)) scalar.
7044 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7045 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7046 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7048 BOperand = ZeroExtended.getOperand(0);
7050 BOperand = Ld.getOperand(0).getOperand(0);
7051 MVT MaskVT = BOperand.getSimpleValueType();
7052 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7053 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7055 DAG.getNode(X86ISD::VBROADCASTM, dl,
7056 MVT::getVectorVT(EltType, NumElts), BOperand);
7057 return DAG.getBitcast(VT, Brdcst);
7062 // We need a splat of a single value to use broadcast, and it doesn't
7063 // make any sense if the value is only in one element of the vector.
7064 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
7065 APInt SplatValue, Undef;
7066 unsigned SplatBitSize;
7068 // Check if this is a repeated constant pattern suitable for broadcasting.
// A "splat" wider than the scalar type but narrower than the full vector,
// e.g. <0,1,0,1> viewed as a repeated 2-element constant.
7069 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7070 SplatBitSize > VT.getScalarSizeInBits() &&
7071 SplatBitSize < VT.getSizeInBits()) {
7072 // Avoid replacing with broadcast when it's a use of a shuffle
7073 // instruction to preserve the present custom lowering of shuffles.
7074 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7076 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7077 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7078 LLVMContext *Ctx = DAG.getContext();
7079 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7080 if (Subtarget.hasAVX()) {
// 64-bit integer broadcasts need a 64-bit GPR path, hence the 32-bit
// subtarget exclusion.
7081 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7082 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7083 // Splatted value can fit in one INTEGER constant in constant pool.
7084 // Load the constant and broadcast it.
7085 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7086 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7087 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7088 SDValue CP = DAG.getConstantPool(C, PVT);
7089 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7091 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7093 CVT, dl, DAG.getEntryNode(), CP,
7094 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7096 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7097 MVT::getVectorVT(CVT, Repeat), Ld);
7098 return DAG.getBitcast(VT, Brdcst);
7099 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7100 // Splatted value can fit in one FLOAT constant in constant pool.
7101 // Load the constant and broadcast it.
7102 // AVX has support for 32- and 64-bit broadcast for floats only.
7103 // No 64-bit integer broadcast on a 32-bit subtarget.
7104 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7105 // Lower the splat via APFloat directly, to avoid any conversion.
7108 ? ConstantFP::get(*Ctx,
7109 APFloat(APFloat::IEEEsingle(), SplatValue))
7110 : ConstantFP::get(*Ctx,
7111 APFloat(APFloat::IEEEdouble(), SplatValue));
7112 SDValue CP = DAG.getConstantPool(C, PVT);
7113 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7115 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7117 CVT, dl, DAG.getEntryNode(), CP,
7118 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7120 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7121 MVT::getVectorVT(CVT, Repeat), Ld);
7122 return DAG.getBitcast(VT, Brdcst);
7123 } else if (SplatBitSize > 64) {
7124 // Pattern wider than 64 bits: load the vector of constants from the
7125 // pool and broadcast the whole subvector (SUBV_BROADCAST).
7125 MVT CVT = VT.getScalarType();
7126 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7128 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7129 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7130 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7132 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7133 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7135 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7136 return DAG.getBitcast(VT, Brdcst);
// From here on, Ld is a genuine single-scalar splat value.
7143 bool ConstSplatVal =
7144 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7146 // Make sure that all of the users of a non-constant load are from the
7147 // BUILD_VECTOR node.
7148 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7151 unsigned ScalarSize = Ld.getValueSizeInBits();
7152 bool IsGE256 = (VT.getSizeInBits() >= 256);
7154 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7155 // instruction to save 8 or more bytes of constant pool data.
7156 // TODO: If multiple splats are generated to load the same constant,
7157 // it may be detrimental to overall size. There needs to be a way to detect
7158 // that condition to know if this is truly a size win.
7159 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7161 // Handle broadcasting a single constant scalar from the constant pool
7163 // On Sandybridge (no AVX2), it is still better to load a constant vector
7164 // from the constant pool and not to broadcast it from a scalar.
7165 // But override that restriction when optimizing for size.
7166 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7167 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7168 EVT CVT = Ld.getValueType();
7169 assert(!CVT.isVector() && "Must not broadcast a vector type");
7171 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7172 // For size optimization, also splat v2f64 and v2i64, and for size opt
7173 // with AVX2, also splat i8 and i16.
7174 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7175 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7176 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7177 const Constant *C = nullptr;
7178 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7179 C = CI->getConstantIntValue();
7180 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7181 C = CF->getConstantFPValue();
7183 assert(C && "Invalid constant type");
7185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7187 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7188 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7190 CVT, dl, DAG.getEntryNode(), CP,
7191 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7194 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7198 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7200 // Handle AVX2 in-register broadcasts.
7201 if (!IsLoad && Subtarget.hasInt256() &&
7202 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7203 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7205 // The scalar source must be a normal load.
// Load-folded broadcast: 32-bit always, 64-bit for 256+-bit vectors or VLX.
7209 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7210 (Subtarget.hasVLX() && ScalarSize == 64))
7211 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7213 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
7214 // double since there is no vbroadcastsd xmm
7215 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7216 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7217 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7220 // Unsupported broadcast.
7224 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7225 /// underlying vector and index.
7227 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7229 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7231 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
// If the source isn't a shuffle there is nothing to look through.
7232 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7235 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7237 // (extract_vector_elt (v8f32 %1), Constant<6>)
7239 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7240 // (extract_subvector (v8f32 %0), Constant<4>),
7243 // In this case the vector is the extract_subvector expression and the index
7244 // is 2, as specified by the shuffle.
7245 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7246 SDValue ShuffleVec = SVOp->getOperand(0);
7247 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7248 assert(ShuffleVecVT.getVectorElementType() ==
7249 ExtractedFromVec.getSimpleValueType().getVectorElementType());
// Map the extract index through the shuffle mask; only accept lanes coming
// from the shuffle's first operand (or undef).
7251 int ShuffleIdx = SVOp->getMaskElt(Idx);
7252 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7253 ExtractedFromVec = ShuffleVec;
// Try to lower a BUILD_VECTOR whose operands are mostly EXTRACT_VECTOR_ELTs
// from at most two source vectors (of the same type) into a single
// vector_shuffle, followed by at most one INSERT_VECTOR_ELT for the one
// operand that isn't an extract. Returns SDValue() if the pattern doesn't
// match.
7259 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7260 MVT VT = Op.getSimpleValueType();
7262 // Skip if insert_vec_elt is not supported.
7263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7264 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7268 unsigned NumElems = Op.getNumOperands();
// InsertIndices: element positions that must be inserted after the shuffle.
// Mask: shuffle mask being built (-1 = undef lane).
7272 SmallVector<unsigned, 4> InsertIndices;
7273 SmallVector<int, 8> Mask(NumElems, -1);
7275 for (unsigned i = 0; i != NumElems; ++i) {
7276 unsigned Opc = Op.getOperand(i).getOpcode();
7278 if (Opc == ISD::UNDEF)
7281 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7282 // Quit if more than 1 elements need inserting.
7283 if (InsertIndices.size() > 1)
7286 InsertIndices.push_back(i);
7290 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7291 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7293 // Quit if non-constant index.
7294 if (!isa<ConstantSDNode>(ExtIdx))
// Look through shuffles to find the real source vector and lane.
7296 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7298 // Quit if extracted from vector of different type.
7299 if (ExtractedFromVec.getValueType() != VT)
// Bind the source vector to VecIn1 or VecIn2; more than two distinct
// sources cannot be expressed as a single shuffle.
7302 if (!VecIn1.getNode())
7303 VecIn1 = ExtractedFromVec;
7304 else if (VecIn1 != ExtractedFromVec) {
7305 if (!VecIn2.getNode())
7306 VecIn2 = ExtractedFromVec;
7307 else if (VecIn2 != ExtractedFromVec)
7308 // Quit if more than 2 vectors to shuffle
7312 if (ExtractedFromVec == VecIn1)
7314 else if (ExtractedFromVec == VecIn2)
// Second-operand lanes are offset by NumElems in a shuffle mask.
7315 Mask[i] = Idx + NumElems;
7318 if (!VecIn1.getNode())
7321 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7322 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
// Patch in the (at most one) non-extract operand on top of the shuffle.
7324 for (unsigned Idx : InsertIndices)
7325 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7326 DAG.getIntPtrConstant(Idx, DL));
// Pack an all-constant vXi1 build_vector into a scalar integer constant:
// bit idx of the result is element idx of the vector. The result type is at
// least i8 even for vectors narrower than 8 elements.
7331 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7332 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7333 Op.getScalarValueSizeInBits() == 1 &&
7334 "Can not convert non-constant vector");
7335 uint64_t Immediate = 0;
7336 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7337 SDValue In = Op.getOperand(idx);
// Only the low bit of each constant element is meaningful for i1.
7339 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7342 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7343 return DAG.getConstant(Immediate, dl, VT);
7345 // Lower a BUILD_VECTOR of vXi1 mask elements (v8i1/v16i1, and v64i1 via
// splitting on 32-bit targets). Constant vectors become packed immediates;
// splats become a select; mixed vectors start from the constant part and
// insert the non-constant elements one by one.
7346 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7347 const X86Subtarget &Subtarget) {
7349 MVT VT = Op.getSimpleValueType();
7350 assert((VT.getVectorElementType() == MVT::i1) &&
7351 "Unexpected type in LowerBUILD_VECTORvXi1!");
// Trivial cases: all-zeros / all-ones masks.
7354 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7357 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7360 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
// A v64i1 immediate doesn't fit in a 32-bit GPR; split into two v32i1.
7361 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7362 // Split the pieces.
7364 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7366 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7367 // We have to manually lower both halves so getNode doesn't try to
7368 // reassemble the build_vector.
7369 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7370 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7371 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
// All-constant vector: bitcast the packed immediate to the mask type;
// if the immediate is wider (minimum i8), extract the low subvector.
7373 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7374 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7375 return DAG.getBitcast(VT, Imm);
7376 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7378 DAG.getIntPtrConstant(0, dl));
7381 // Vector has one or more non-const elements
7382 uint64_t Immediate = 0;
7383 SmallVector<unsigned, 16> NonConstIdx;
7384 bool IsSplat = true;
7385 bool HasConstElts = false;
// Single pass: collect constant bits, non-constant indices, splat-ness.
7387 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7388 SDValue In = Op.getOperand(idx);
7391 if (!isa<ConstantSDNode>(In))
7392 NonConstIdx.push_back(idx);
7394 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7395 HasConstElts = true;
7399 else if (In != Op.getOperand(SplatIdx))
7403 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7405 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7406 DAG.getConstant(1, dl, VT),
7407 DAG.getConstant(0, dl, VT));
7409 // insert elements one by one
// Build the starting vector from the constant elements (or undef), then
// insert each non-constant element at its recorded index.
7413 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7414 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7416 else if (HasConstElts)
7417 Imm = DAG.getConstant(0, dl, VT);
7419 Imm = DAG.getUNDEF(VT);
7420 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7421 DstVec = DAG.getBitcast(VT, Imm);
7423 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7424 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7425 DAG.getIntPtrConstant(0, dl));
7428 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7429 unsigned InsertIdx = NonConstIdx[i];
7430 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7431 Op.getOperand(InsertIdx),
7432 DAG.getIntPtrConstant(InsertIdx, dl));
7437 /// Return true if \p N implements a horizontal binop and return the
7438 /// operands for the horizontal binop into V0 and V1.
7440 /// This is a helper function of LowerToHorizontalOp().
7441 /// This function checks that the build_vector \p N in input implements a
7442 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7443 /// operation to match.
7444 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7445 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7446 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7449 /// This function only analyzes elements of \p N whose indices are
7450 /// in range [BaseIdx, LastIdx).
7451 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7453 unsigned BaseIdx, unsigned LastIdx,
7454 SDValue &V0, SDValue &V1) {
7455 EVT VT = N->getValueType(0);
7457 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7458 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7459 "Invalid Vector in input!");
// Only ADD/FADD may match the operand-swapped pattern below.
7461 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7462 bool CanFold = true;
// Each matched element consumes two consecutive source lanes, so the
// expected extract index advances by 2 per element.
7463 unsigned ExpectedVExtractIdx = BaseIdx;
7464 unsigned NumElts = LastIdx - BaseIdx;
7465 V0 = DAG.getUNDEF(VT);
7466 V1 = DAG.getUNDEF(VT);
7468 // Check if N implements a horizontal binop.
7469 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7470 SDValue Op = N->getOperand(i + BaseIdx);
7473 if (Op->isUndef()) {
7474 // Update the expected vector extract index.
// Crossing into the second half resets the index to the base (the
// second half draws from V1).
7475 if (i * 2 == NumElts)
7476 ExpectedVExtractIdx = BaseIdx;
7477 ExpectedVExtractIdx += 2;
7481 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7486 SDValue Op0 = Op.getOperand(0);
7487 SDValue Op1 = Op.getOperand(1);
7489 // Try to match the following pattern:
7490 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7491 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7492 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7493 Op0.getOperand(0) == Op1.getOperand(0) &&
7494 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7495 isa<ConstantSDNode>(Op1.getOperand(1)));
7499 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7500 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
// First half of the output binds source vector V0, second half binds V1.
7502 if (i * 2 < NumElts) {
7504 V0 = Op0.getOperand(0);
7505 if (V0.getValueType() != VT)
7510 V1 = Op0.getOperand(0);
7511 if (V1.getValueType() != VT)
7514 if (i * 2 == NumElts)
7515 ExpectedVExtractIdx = BaseIdx;
7518 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7519 if (I0 == ExpectedVExtractIdx)
7520 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7521 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7522 // Try to match the following dag sequence:
7523 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7524 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7528 ExpectedVExtractIdx += 2;
7534 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7535 /// a concat_vector.
7537 /// This is a helper function of LowerToHorizontalOp().
7538 /// This function expects two 256-bit vectors called V0 and V1.
7539 /// At first, each vector is split into two separate 128-bit vectors.
7540 /// Then, the resulting 128-bit vectors are used to implement two
7541 /// horizontal binary operations.
7543 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7545 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7546 /// the two new horizontal binop.
7547 /// When Mode is set, the first horizontal binop dag node would take as input
7548 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7549 /// horizontal binop dag node would take as input the lower 128-bit of V1
7550 /// and the upper 128-bit of V1.
7552 /// HADD V0_LO, V0_HI
7553 /// HADD V1_LO, V1_HI
7555 /// Otherwise, the first horizontal binop dag node takes as input the lower
7556 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7557 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7559 /// HADD V0_LO, V1_LO
7560 /// HADD V0_HI, V1_HI
7562 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7563 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7564 /// the upper 128-bits of the result.
7565 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7566 const SDLoc &DL, SelectionDAG &DAG,
7567 unsigned X86Opcode, bool Mode,
7568 bool isUndefLO, bool isUndefHI) {
7569 MVT VT = V0.getSimpleValueType();
7570 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7571 "Invalid nodes in input!");
// Split each 256-bit input into its 128-bit halves.
7573 unsigned NumElts = VT.getVectorNumElements();
7574 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7575 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7576 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7577 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7578 MVT NewVT = V0_LO.getSimpleValueType();
7580 SDValue LO = DAG.getUNDEF(NewVT);
7581 SDValue HI = DAG.getUNDEF(NewVT);
// Mode set: pair each input with itself (V0_LO+V0_HI, V1_LO+V1_HI).
7584 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7585 if (!isUndefLO && !V0->isUndef())
7586 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI)
7587 if (!isUndefHI && !V1->isUndef())
7588 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
// Mode unset: pair the inputs with each other (LO halves, then HI halves).
7590 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7591 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7592 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7594 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7595 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7598 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7601 /// Returns true iff \p BV builds a vector with the result equivalent to
7602 /// the result of ADDSUB/SUBADD operation.
7603 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7604 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7605 /// \p Opnd0 and \p Opnd1.
7606 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7607 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7608 SDValue &Opnd0, SDValue &Opnd1,
7609 unsigned &NumExtracts,
// ADDSUB/SUBADD lowering requires SSE3 and only applies to FP vectors.
7612 MVT VT = BV->getSimpleValueType(0);
7613 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7616 unsigned NumElts = VT.getVectorNumElements();
7617 SDValue InVec0 = DAG.getUNDEF(VT);
7618 SDValue InVec1 = DAG.getUNDEF(VT);
7622 // Odd-numbered elements in the input build vector are obtained from
7623 // adding/subtracting two integer/float elements.
7624 // Even-numbered elements in the input build vector are obtained from
7625 // subtracting/adding two integer/float elements.
// Opc[0]/Opc[1] record the opcode seen for even/odd lanes; both must be
// consistent across the vector and different from each other.
7626 unsigned Opc[2] {0, 0};
7627 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7628 SDValue Op = BV->getOperand(i);
7630 // Skip 'undef' values.
7631 unsigned Opcode = Op.getOpcode();
7632 if (Opcode == ISD::UNDEF)
7635 // Early exit if we found an unexpected opcode.
7636 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7639 SDValue Op0 = Op.getOperand(0);
7640 SDValue Op1 = Op.getOperand(1);
7642 // Try to match the following pattern:
7643 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7644 // Early exit if we cannot match that sequence.
7645 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7646 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7647 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7648 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7649 Op0.getOperand(1) != Op1.getOperand(1))
7652 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7656 // We found a valid add/sub node, make sure its the same opcode as previous
7657 // elements for this parity.
7658 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7660 Opc[i % 2] = Opcode;
7662 // Update InVec0 and InVec1.
7663 if (InVec0.isUndef()) {
7664 InVec0 = Op0.getOperand(0);
7665 if (InVec0.getSimpleValueType() != VT)
7668 if (InVec1.isUndef()) {
7669 InVec1 = Op1.getOperand(0);
7670 if (InVec1.getSimpleValueType() != VT)
7674 // Make sure that operands in input to each add/sub node always
7675 // come from a same pair of vectors.
7676 if (InVec0 != Op0.getOperand(0)) {
7677 if (Opcode == ISD::FSUB)
7680 // FADD is commutable. Try to commute the operands
7681 // and then test again.
7682 std::swap(Op0, Op1);
7683 if (InVec0 != Op0.getOperand(0))
7687 if (InVec1 != Op1.getOperand(0))
7690 // Increment the number of extractions done.
7694 // Ensure we have found an opcode for both parities and that they are
7695 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7696 // inputs are undef.
7697 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7698 InVec0.isUndef() || InVec1.isUndef())
// SUBADD iff even lanes add and odd lanes subtract.
7701 IsSubAdd = Opc[0] == ISD::FADD;
7708 /// Returns true if is possible to fold MUL and an idiom that has already been
7709 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7710 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7711 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7713 /// Prior to calling this function it should be known that there is some
7714 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7715 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7716 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7717 /// of \p Opnd0 uses is expected to be equal to 2.
7718 /// For example, this function may be called for the following IR:
7719 /// %AB = fmul fast <2 x double> %A, %B
7720 /// %Sub = fsub fast <2 x double> %AB, %C
7721 /// %Add = fadd fast <2 x double> %AB, %C
7722 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7723 /// <2 x i32> <i32 0, i32 3>
7724 /// There is a def for %Addsub here, which potentially can be replaced by
7725 /// X86ISD::ADDSUB operation:
7726 /// %Addsub = X86ISD::ADDSUB %AB, %C
7727 /// and such ADDSUB can further be replaced with FMADDSUB:
7728 /// %Addsub = FMADDSUB %A, %B, %C.
7730 /// The main reason why this method is called before the replacement of the
7731 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7732 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7734 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7736 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7737 unsigned ExpectedUses) {
// The addend side must be an FMUL whose only uses are the ADDSUB idiom
// (ExpectedUses of them), and the target must have some FMA support.
7738 if (Opnd0.getOpcode() != ISD::FMUL ||
7739 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7742 // FIXME: These checks must match the similar ones in
7743 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7744 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7745 // or MUL + ADDSUB to FMADDSUB.
7746 const TargetOptions &Options = DAG.getTarget().Options;
7748 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
// Rewrite the out-params: FMADDSUB(Opnd0, Opnd1, Opnd2) where Opnd0/Opnd1
// are the FMUL's factors (Opnd2 is assigned from the old addend elsewhere).
7753 Opnd1 = Opnd0.getOperand(1);
7754 Opnd0 = Opnd0.getOperand(0);
7759 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
7760 /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
7761 /// X86ISD::FMSUBADD node.
7762 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7763 const X86Subtarget &Subtarget,
7764 SelectionDAG &DAG) {
7765 SDValue Opnd0, Opnd1;
7766 unsigned NumExtracts;
// First recognize the ADDSUB/SUBADD idiom itself.
7768 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7772 MVT VT = BV->getSimpleValueType(0);
7775 // Try to generate X86ISD::FMADDSUB node here.
// Prefer the fused form when the recognized Opnd0 is an eligible FMUL.
7777 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7778 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7779 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7782 // We only support ADDSUB.
7786 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7787 // the ADDSUB idiom has been successfully recognized. There are no known
7788 // X86 targets with 512-bit ADDSUB instructions!
7789 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7791 if (VT.is512BitVector())
7794 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7797 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7798 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7799 const X86Subtarget &Subtarget,
7800 SelectionDAG &DAG) {
7801 MVT VT = BV->getSimpleValueType(0);
7802 unsigned NumElts = VT.getVectorNumElements();
7803 unsigned NumUndefsLO = 0;
7804 unsigned NumUndefsHI = 0;
7805 unsigned Half = NumElts/2;
7807 // Count the number of UNDEF operands in the build_vector in input.
7808 for (unsigned i = 0, e = Half; i != e; ++i)
7809 if (BV->getOperand(i)->isUndef())
7812 for (unsigned i = Half, e = NumElts; i != e; ++i)
7813 if (BV->getOperand(i)->isUndef())
7816 // Early exit if this is either a build_vector of all UNDEFs or all the
7817 // operands but one are UNDEF.
7818 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7822 SDValue InVec0, InVec1;
7823 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7824 // Try to match an SSE3 float HADD/HSUB.
7825 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7826 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7828 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7829 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7830 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7831 // Try to match an SSSE3 integer HADD/HSUB.
7832 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7833 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7835 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7836 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7839 if (!Subtarget.hasAVX())
7842 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7843 // Try to match an AVX horizontal add/sub of packed single/double
7844 // precision floating point values from 256-bit vectors.
7845 SDValue InVec2, InVec3;
7846 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7847 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7848 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7849 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7850 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7852 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7853 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7854 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7855 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7856 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7857 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7858 // Try to match an AVX2 horizontal add/sub of signed integers.
7859 SDValue InVec2, InVec3;
7861 bool CanFold = true;
7863 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7864 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7865 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7866 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7867 X86Opcode = X86ISD::HADD;
7868 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7869 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7870 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7871 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7872 X86Opcode = X86ISD::HSUB;
7877 // Fold this build_vector into a single horizontal add/sub.
7878 // Do this only if the target has AVX2.
7879 if (Subtarget.hasAVX2())
7880 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7882 // Do not try to expand this build_vector into a pair of horizontal
7883 // add/sub if we can emit a pair of scalar add/sub.
7884 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7887 // Convert this build_vector into a pair of horizontal binop followed by
7889 bool isUndefLO = NumUndefsLO == Half;
7890 bool isUndefHI = NumUndefsHI == Half;
7891 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7892 isUndefLO, isUndefHI);
7896 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7897 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7899 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7900 X86Opcode = X86ISD::HADD;
7901 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7902 X86Opcode = X86ISD::HSUB;
7903 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7904 X86Opcode = X86ISD::FHADD;
7905 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7906 X86Opcode = X86ISD::FHSUB;
7910 // Don't try to expand this build_vector into a pair of horizontal add/sub
7911 // if we can simply emit a pair of scalar add/sub.
7912 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7915 // Convert this build_vector into two horizontal add/sub followed by
7917 bool isUndefLO = NumUndefsLO == Half;
7918 bool isUndefHI = NumUndefsHI == Half;
7919 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7920 isUndefLO, isUndefHI);
7926 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7927 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7928 /// just apply the bit to the vectors.
7929 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
7930 /// from this, but enough scalar bit operations are created from the later
7931 /// legalization + scalarization stages to need basic support.
7932 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7933 SelectionDAG &DAG) {
// Folds (build_vector (op x0, c0), (op x1, c1), ...) -- where every element
// applies the same binary bit opcode with a constant RHS -- into a single
// (op (build_vector x0...), (build_vector c0...)).
// NOTE(review): several bail-out statements (return SDValue()) are elided in
// this excerpt (gaps in the embedded numbering) -- confirm against upstream.
7935 MVT VT = Op->getSimpleValueType(0);
7936 unsigned NumElems = VT.getVectorNumElements();
7937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7939 // Check that all elements have the same opcode.
7940 // TODO: Should we allow UNDEFS and if so how many?
7941 unsigned Opcode = Op->getOperand(0).getOpcode();
7942 for (unsigned i = 1; i < NumElems; ++i)
7943 if (Opcode != Op->getOperand(i).getOpcode())
7946 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7953 // Don't do this if the buildvector is a splat - we'd replace one
7954 // constant with an entire vector.
7955 if (Op->getSplatValue())
// Only worthwhile when the target can execute (or promote) this opcode on VT.
7957 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
// Split every element into its LHS (variable) and RHS (constant) operand.
7962 SmallVector<SDValue, 4> LHSElts, RHSElts;
7963 for (SDValue Elt : Op->ops()) {
7964 SDValue LHS = Elt.getOperand(0);
7965 SDValue RHS = Elt.getOperand(1);
7967 // We expect the canonicalized RHS operand to be the constant.
7968 if (!isa<ConstantSDNode>(RHS))
7970 LHSElts.push_back(LHS);
7971 RHSElts.push_back(RHS);
// Rebuild as two BUILD_VECTORs and apply the shared bit operation once.
7974 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7975 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7976 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7979 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7980 /// functionality to do this, so it's all zeros, all ones, or some derivation
7981 /// that is cheap to calculate.
7982 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7983 const X86Subtarget &Subtarget) {
// Fast-path BUILD_VECTORs that are all-zeros or all-ones, which SSE/AVX can
// materialize without a constant-pool load. Any other constant pattern falls
// through (the trailing return is elided in this excerpt).
7985 MVT VT = Op.getSimpleValueType();
7987 // Vectors containing all zeros can be matched by pxor and xorps.
7988 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7989 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7990 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7991 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7994 return getZeroVector(VT, Subtarget, DAG, DL);
7997 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7998 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7999 // vpcmpeqd on 256-bit vectors.
8000 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8001 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
8002 (VT == MVT::v8i32 && Subtarget.hasInt256()))
8005 return getOnesVector(VT, DAG, DL);
8011 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8012 /// from a vector of source values and a vector of extraction indices.
8013 /// The vectors might be manipulated to match the type of the permute op.
8014 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8015 SDLoc &DL, SelectionDAG &DAG,
8016 const X86Subtarget &Subtarget) {
// Builds a variable permute (VPERMV / VPERMILPV / PSHUFB / VPPERM / VPERMIL2)
// of SrcVec selected by the per-element runtime indices in IndicesVec.
// May recurse after widening/rescaling the operands to a supported type.
8018 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8019 unsigned NumElts = VT.getVectorNumElements();
8020 unsigned SizeInBits = VT.getSizeInBits();
8022 // Adjust IndicesVec to match VT size.
8023 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8024 "Illegal variable permute mask size");
8025 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8026 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8027 NumElts * VT.getScalarSizeInBits());
8028 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8030 // Handle a SrcVec whose size doesn't match the VT size.
8031 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8032 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8033 // Handle larger SrcVec by treating it as a larger permute.
8034 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8035 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8036 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8037 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8038 Subtarget, DAG, SDLoc(IndicesVec));
// Recurse at the wider type, then extract the originally requested width.
8039 return extractSubVector(
8040 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
8041 DAG, DL, SizeInBits);
8042 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8043 // Widen smaller SrcVec to match VT.
8044 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
// Repeats each index Scale times while multiplying/offsetting it, so a
// coarse-grained index vector can drive a finer-grained shuffle op.
8049 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8050 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8051 EVT SrcVT = Idx.getValueType();
8052 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8053 uint64_t IndexScale = 0;
8054 uint64_t IndexOffset = 0;
8056 // If we're scaling a smaller permute op, then we need to repeat the
8057 // indices, scaling and offsetting them as well.
8058 // e.g. v4i32 -> v16i8 (Scale = 4)
8059 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8060 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8061 for (uint64_t i = 0; i != Scale; ++i) {
8062 IndexScale |= Scale << (i * NumDstBits);
8063 IndexOffset |= i << (i * NumDstBits);
8066 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8067 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8068 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8069 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
// Pick the shuffle opcode (and possibly an alternate ShuffleVT) per type.
// NOTE(review): the case labels (e.g. 'case MVT::v16i8:'), 'break's and the
// ShuffleVT declaration are elided in this excerpt -- confirm upstream.
8073 unsigned Opcode = 0;
8074 switch (VT.SimpleTy) {
8078 if (Subtarget.hasSSSE3())
8079 Opcode = X86ISD::PSHUFB;
8082 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8083 Opcode = X86ISD::VPERMV;
8084 else if (Subtarget.hasSSSE3()) {
8085 Opcode = X86ISD::PSHUFB;
8086 ShuffleVT = MVT::v16i8;
8091 if (Subtarget.hasAVX()) {
8092 Opcode = X86ISD::VPERMILPV;
8093 ShuffleVT = MVT::v4f32;
8094 } else if (Subtarget.hasSSSE3()) {
8095 Opcode = X86ISD::PSHUFB;
8096 ShuffleVT = MVT::v16i8;
8101 if (Subtarget.hasAVX()) {
8102 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8103 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8104 Opcode = X86ISD::VPERMILPV;
8105 ShuffleVT = MVT::v2f64;
8106 } else if (Subtarget.hasSSE41()) {
8107 // SSE41 can compare v2i64 - select between indices 0 and 1.
8108 return DAG.getSelectCC(
8110 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8111 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8112 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8113 ISD::CondCode::SETEQ);
8117 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8118 Opcode = X86ISD::VPERMV;
8119 else if (Subtarget.hasXOP()) {
// XOP VPPERM can index across both 128-bit halves, so handle each output
// half with one VPPERM and concatenate.
8120 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8121 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8122 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8123 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8125 ISD::CONCAT_VECTORS, DL, VT,
8126 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8127 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8128 } else if (Subtarget.hasAVX()) {
8129 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8130 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8131 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8132 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8133 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8134 ArrayRef<SDValue> Ops) {
8135 // Permute Lo and Hi and then select based on index range.
8136 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8137 // care about bit[7] as it's just an index vector.
8138 SDValue Idx = Ops[2];
8139 EVT VT = Idx.getValueType();
8140 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8141 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8142 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8143 ISD::CondCode::SETGT);
8145 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8146 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8151 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8152 Opcode = X86ISD::VPERMV;
8153 else if (Subtarget.hasAVX()) {
8154 // Scale to v32i8 and perform as v32i8.
8155 IndicesVec = ScaleIndices(IndicesVec, 2);
8156 return DAG.getBitcast(
8157 VT, createVariablePermute(
8158 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8159 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8164 if (Subtarget.hasAVX2())
8165 Opcode = X86ISD::VPERMV;
8166 else if (Subtarget.hasAVX()) {
8167 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8168 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8169 {0, 1, 2, 3, 0, 1, 2, 3});
8170 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8171 {4, 5, 6, 7, 4, 5, 6, 7});
8172 if (Subtarget.hasXOP())
8173 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8174 LoLo, HiHi, IndicesVec,
8175 DAG.getConstant(0, DL, MVT::i8)));
8176 // Permute Lo and Hi and then select based on index range.
8177 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8178 SDValue Res = DAG.getSelectCC(
8179 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8180 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8181 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8182 ISD::CondCode::SETGT);
8183 return DAG.getBitcast(VT, Res);
8188 if (Subtarget.hasAVX512()) {
8189 if (!Subtarget.hasVLX()) {
// Without VLX the 512-bit VPERMV form is used: widen to 8 elements,
// permute, then extract the low 256 bits.
8190 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8191 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8193 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8194 DAG, SDLoc(IndicesVec));
8195 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8197 return extract256BitVector(Res, 0, DAG, DL);
8199 Opcode = X86ISD::VPERMV;
8200 } else if (Subtarget.hasAVX()) {
8201 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8203 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8205 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8206 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8207 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8208 if (Subtarget.hasXOP())
8209 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8210 LoLo, HiHi, IndicesVec,
8211 DAG.getConstant(0, DL, MVT::i8)));
8212 // Permute Lo and Hi and then select based on index range.
8213 // This works as VPERMILPD only uses index bit[1] to permute elements.
8214 SDValue Res = DAG.getSelectCC(
8215 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8216 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8217 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8218 ISD::CondCode::SETGT);
8219 return DAG.getBitcast(VT, Res);
8223 if (Subtarget.hasVBMI())
8224 Opcode = X86ISD::VPERMV;
8227 if (Subtarget.hasBWI())
8228 Opcode = X86ISD::VPERMV;
8234 if (Subtarget.hasAVX512())
8235 Opcode = X86ISD::VPERMV;
// Common tail: emit the chosen shuffle op at ShuffleVT granularity, scaling
// the indices when ShuffleVT is finer-grained than VT.
8241 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8242 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8243 "Illegal variable permute shuffle type");
8245 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8247 IndicesVec = ScaleIndices(IndicesVec, Scale);
8249 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8250 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8252 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
// VPERMV takes (mask, src); the other shuffle ops take (src, mask).
8253 SDValue Res = Opcode == X86ISD::VPERMV
8254 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8255 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8256 return DAG.getBitcast(VT, Res);
8259 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8260 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8261 // (build_vector (extract_elt V, (extract_elt I, 0)),
8262 // (extract_elt V, (extract_elt I, 1)),
8267 // TODO: Handle undefs
8268 // TODO: Utilize pshufb and zero mask blending to support more efficient
8269 // construction of vectors with constant-0 elements.
8271 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8272 const X86Subtarget &Subtarget) {
// Matches a BUILD_VECTOR whose i-th operand is
// (extract_elt SrcVec, (extract_elt IndicesVec, i)) for a single SrcVec and
// IndicesVec, and lowers it via createVariablePermute. Rejection paths
// (return SDValue()) are elided in this excerpt.
8273 SDValue SrcVec, IndicesVec;
8274 // Check for a match of the permute source vector and permute index elements.
8275 // This is done by checking that the i-th build_vector operand is of the form:
8276 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8277 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8278 SDValue Op = V.getOperand(Idx);
8279 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8282 // If this is the first extract encountered in V, set the source vector,
8283 // otherwise verify the extract is from the previously defined source vector.
8286 SrcVec = Op.getOperand(0);
8287 else if (SrcVec != Op.getOperand(0))
8289 SDValue ExtractedIndex = Op->getOperand(1);
8290 // Peek through extends.
8291 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8292 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8293 ExtractedIndex = ExtractedIndex.getOperand(0);
8294 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8297 // If this is the first extract from the index vector candidate, set the
8298 // indices vector, otherwise verify the extract is from the previously
8299 // defined indices vector.
8301 IndicesVec = ExtractedIndex.getOperand(0);
8302 else if (IndicesVec != ExtractedIndex.getOperand(0))
// The index element must be extracted at constant position Idx, so lane i of
// the result is selected by lane i of IndicesVec.
8305 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8306 if (!PermIdx || PermIdx->getZExtValue() != Idx)
8311 MVT VT = V.getSimpleValueType();
8312 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
// Custom x86 lowering for BUILD_VECTOR. Tries, in order: vXi1 predicate
// lowering (AVX512), cheap all-zeros/all-ones materialization,
// ADDSUB/FMADDSUB matching, horizontal ops, broadcast, bit-op folding, then
// a series of element-pattern special cases, falling back to an
// unpack-based expansion at the end.
8316 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8319 MVT VT = Op.getSimpleValueType();
8320 MVT EltVT = VT.getVectorElementType();
8321 unsigned NumElems = Op.getNumOperands();
8323 // Generate vectors for predicate vectors.
8324 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8325 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8327 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8328 return VectorConstant;
8330 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8331 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8333 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8334 return HorizontalOp;
8335 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8337 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8340 unsigned EVTBits = EltVT.getSizeInBits();
// Single scan over the operands collecting: which lanes are non-zero
// (NonZeros bitmask), zero/non-zero/constant counts and the set of distinct
// values. NOTE(review): the counter increments are elided in this excerpt.
8342 unsigned NumZero = 0;
8343 unsigned NumNonZero = 0;
8344 uint64_t NonZeros = 0;
8345 bool IsAllConstants = true;
8346 SmallSet<SDValue, 8> Values;
8347 unsigned NumConstants = NumElems;
8348 for (unsigned i = 0; i < NumElems; ++i) {
8349 SDValue Elt = Op.getOperand(i);
8353 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8354 IsAllConstants = false;
8357 if (X86::isZeroNode(Elt))
8360 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8361 NonZeros |= ((uint64_t)1 << i);
8366 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8367 if (NumNonZero == 0)
8368 return DAG.getUNDEF(VT);
8370 // If we are inserting one variable into a vector of non-zero constants, try
8371 // to avoid loading each constant element as a scalar. Load the constants as a
8372 // vector and then insert the variable scalar element. If insertion is not
8373 // supported, we assume that we will fall back to a shuffle to get the scalar
8374 // blended with the constants. Insertion into a zero vector is handled as a
8375 // special-case somewhere below here.
8376 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8377 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8378 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8379 // Create an all-constant vector. The variable element in the old
8380 // build vector is replaced by undef in the constant vector. Save the
8381 // variable scalar element and its index for use in the insertelement.
8382 LLVMContext &Context = *DAG.getContext();
8383 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8384 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8387 for (unsigned i = 0; i != NumElems; ++i) {
8388 SDValue Elt = Op.getOperand(i);
8389 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8390 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8391 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8392 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8393 else if (!Elt.isUndef()) {
8394 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8395 "Expected one variable element in this vector");
8397 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8400 Constant *CV = ConstantVector::get(ConstVecOps);
8401 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8403 // The constants we just created may not be legal (eg, floating point). We
8404 // must lower the vector right here because we can not guarantee that we'll
8405 // legalize it before loading it. This is also why we could not just create
8406 // a new build vector here. If the build vector contains illegal constants,
8407 // it could get split back up into a series of insert elements.
8408 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8409 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8410 MachineFunction &MF = DAG.getMachineFunction();
8411 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8412 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8413 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8416 // Special case for single non-zero, non-undef, element.
8417 if (NumNonZero == 1) {
8418 unsigned Idx = countTrailingZeros(NonZeros);
8419 SDValue Item = Op.getOperand(Idx);
8421 // If we have a constant or non-constant insertion into the low element of
8422 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8423 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8424 // depending on what the source datatype is.
8427 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8429 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8430 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8431 assert((VT.is128BitVector() || VT.is256BitVector() ||
8432 VT.is512BitVector()) &&
8433 "Expected an SSE value type!");
8434 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8435 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8436 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8439 // We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
8441 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8442 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8443 if (VT.getSizeInBits() >= 256) {
8444 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8445 if (Subtarget.hasAVX()) {
8446 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8447 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8449 // Without AVX, we need to extend to a 128-bit vector and then
8450 // insert into the 256-bit vector.
8451 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8452 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8453 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8456 assert(VT.is128BitVector() && "Expected an SSE value type!");
8457 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8458 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8460 return DAG.getBitcast(VT, Item);
8464 // Is it a vector logical left shift?
8465 if (NumElems == 2 && Idx == 1 &&
8466 X86::isZeroNode(Op.getOperand(0)) &&
8467 !X86::isZeroNode(Op.getOperand(1))) {
8468 unsigned NumBits = VT.getSizeInBits();
8469 return getVShift(true, VT,
8470 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8471 VT, Op.getOperand(1)),
8472 NumBits/2, DAG, *this, dl);
8475 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8478 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8479 // is a non-constant being inserted into an element other than the low one,
8480 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8481 // movd/movss) to move this into the low element, then shuffle it into place.
8483 if (EVTBits == 32) {
8484 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8485 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8489 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8490 if (Values.size() == 1) {
8491 if (EVTBits == 32) {
8492 // Instead of a shuffle like this:
8493 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8494 // Check if it's possible to issue this instead.
8495 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
8496 unsigned Idx = countTrailingZeros(NonZeros);
8497 SDValue Item = Op.getOperand(Idx);
8498 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8499 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8504 // A vector full of immediates; various special cases are already
8505 // handled, so this is best done with a single constant-pool load.
8509 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8512 // See if we can use a vector load to get all of the elements.
8514 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8516 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8520 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8521 // build_vector and broadcast it.
8522 // TODO: We could probably generalize this more.
8523 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8524 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8525 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8526 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8527 // Make sure all the even/odd operands match.
8528 for (unsigned i = 2; i != NumElems; ++i)
8529 if (Ops[i % 2] != Op.getOperand(i))
8533 if (CanSplat(Op, NumElems, Ops)) {
8534 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8535 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8536 // Create a new build vector and cast to v2i64/v2f64.
8537 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8538 DAG.getBuildVector(NarrowVT, dl, Ops));
8539 // Broadcast from v2i64/v2f64 and cast to final VT.
8540 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
8541 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8546 // For AVX-length vectors, build the individual 128-bit pieces and use
8547 // shuffles to put them in place.
8548 if (VT.getSizeInBits() > 128) {
8549 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
8551 // Build both the lower and upper subvector.
8553 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8554 SDValue Upper = DAG.getBuildVector(
8555 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8557 // Recreate the wider vector with the lower and upper part.
8558 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
8559 VT.getSizeInBits() / 2);
8562 // Let legalizer expand 2-wide build_vectors.
8563 if (EVTBits == 64) {
8564 if (NumNonZero == 1) {
8565 // One half is zero or undef.
8566 unsigned Idx = countTrailingZeros(NonZeros);
8567 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8568 Op.getOperand(Idx));
8569 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8574 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8575 if (EVTBits == 8 && NumElems == 16)
8576 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8580 if (EVTBits == 16 && NumElems == 8)
8581 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8585 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8586 if (EVTBits == 32 && NumElems == 4)
8587 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8590 // If element VT is == 32 bits, turn it into a number of shuffles.
8591 if (NumElems == 4 && NumZero > 0) {
8592 SmallVector<SDValue, 8> Ops(NumElems);
// First, lift each non-zero element into its own vector (zero vectors for
// the zero lanes), then combine pairwise below.
8593 for (unsigned i = 0; i < 4; ++i) {
8594 bool isZero = !(NonZeros & (1ULL << i));
8596 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8598 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
// Combine each adjacent pair using MOVL/UNPCKL depending on which of the
// two lanes are non-zero (the 2-bit slice of NonZeros).
8601 for (unsigned i = 0; i < 2; ++i) {
8602 switch ((NonZeros >> (i*2)) & 0x3) {
8603 default: llvm_unreachable("Unexpected NonZero count");
8605 Ops[i] = Ops[i*2]; // Must be a zero vector.
8608 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8611 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8614 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8619 bool Reverse1 = (NonZeros & 0x3) == 2;
8620 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8624 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8625 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8627 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8630 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8632 // Check for a build vector from mostly shuffle plus few inserting.
8633 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8636 // For SSE 4.1, use insertps to put the high elements into the low element.
8637 if (Subtarget.hasSSE41()) {
8639 if (!Op.getOperand(0).isUndef())
8640 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8642 Result = DAG.getUNDEF(VT);
8644 for (unsigned i = 1; i < NumElems; ++i) {
8645 if (Op.getOperand(i).isUndef()) continue;
8646 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8647 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8652 // Otherwise, expand into a number of unpckl*, start by extending each of
8653 // our (non-undef) elements to the full vector width with the element in the
8654 // bottom slot of the vector (which generates no code for SSE).
8655 SmallVector<SDValue, 8> Ops(NumElems);
8656 for (unsigned i = 0; i < NumElems; ++i) {
8657 if (!Op.getOperand(i).isUndef())
8658 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8660 Ops[i] = DAG.getUNDEF(VT);
8663 // Next, we iteratively mix elements, e.g. for v4f32:
8664 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8665 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8666 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8667 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8668 // Generate scaled UNPCKL shuffle mask.
8669 SmallVector<int, 16> Mask;
8670 for(unsigned i = 0; i != Scale; ++i)
8672 for (unsigned i = 0; i != Scale; ++i)
8673 Mask.push_back(NumElems+i);
8674 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8676 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8677 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8682 // 256-bit AVX can use the vinsertf128 instruction
8683 // to create 256-bit vectors from two other 128-bit ones.
8684 // TODO: Detect subvector broadcast here instead of DAG combine?
8685 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8686 const X86Subtarget &Subtarget) {
// Lowers a 256/512-bit CONCAT_VECTORS: with more than two non-zero
// operands, recursively concatenate each half; otherwise insert the
// non-zero subvectors into a zero (or undef) base vector.
8688 MVT ResVT = Op.getSimpleValueType();
8690 assert((ResVT.is256BitVector() ||
8691 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
// Classify each operand as undef / all-zeros / non-zero; non-zero operands
// are recorded in the NonZeros bitmask (increments elided in this excerpt).
8693 unsigned NumOperands = Op.getNumOperands();
8694 unsigned NumZero = 0;
8695 unsigned NumNonZero = 0;
8696 unsigned NonZeros = 0;
8697 for (unsigned i = 0; i != NumOperands; ++i) {
8698 SDValue SubVec = Op.getOperand(i);
8699 if (SubVec.isUndef())
8701 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8704 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8710 // If we have more than 2 non-zeros, build each half separately.
8711 if (NumNonZero > 2) {
8712 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8713 ResVT.getVectorNumElements()/2);
8714 ArrayRef<SDUse> Ops = Op->ops();
8715 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8716 Ops.slice(0, NumOperands/2));
8717 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8718 Ops.slice(NumOperands/2));
8719 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8722 // Otherwise, build it up through insert_subvectors.
8723 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8724 : DAG.getUNDEF(ResVT);
8726 MVT SubVT = Op.getOperand(0).getSimpleValueType();
8727 unsigned NumSubElems = SubVT.getVectorNumElements();
8728 for (unsigned i = 0; i != NumOperands; ++i) {
// Zero/undef subvectors are already covered by the base vector.
8729 if ((NonZeros & (1 << i)) == 0)
8732 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
8734 DAG.getIntPtrConstant(i * NumSubElems, dl));
8740 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8741 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
// NOTE(review): the loop's 'return false'/'return true' lines are elided from
// this listing; the visible scan checks operands 1..N-1 for all-zeros.
8742 static bool isExpandWithZeros(const SDValue &Op) {
8743 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8744 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8746 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8747 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8753 // Returns true if the given node is a type promotion (by concatenating i1
8754 // zeros) of the result of a node that already zeros all upper bits of
// NOTE(review): despite the "Returns true" wording above, the function is
// declared to return an SDValue — presumably the unwrapped source node on a
// match and an empty SDValue otherwise; the return statements are elided here,
// so confirm against the full source.
8756 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8757 unsigned Opc = Op.getOpcode();
8759 assert(Opc == ISD::CONCAT_VECTORS &&
8760 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8761 "Unexpected node to check for type promotion!");
8763 // As long as we are concatenating zeros to the upper part of a previous node
8764 // result, climb up the tree until a node with different opcode is
8766 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8767 if (Opc == ISD::INSERT_SUBVECTOR) {
// insert_subvector (zeros), src, 0 — keep looking through to 'src'.
8768 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8769 Op.getConstantOperandVal(2) == 0)
8770 Op = Op.getOperand(1);
8773 } else { // Opc == ISD::CONCAT_VECTORS
8774 if (isExpandWithZeros(Op))
8775 Op = Op.getOperand(0);
8779 Opc = Op.getOpcode();
8782 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8783 // of a node that zeros the upper bits (its masked version).
8784 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8785 (Op.getOpcode() == ISD::AND &&
8786 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8787 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8794 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
// Lower CONCAT_VECTORS of vXi1 mask vectors (AVX-512 mask registers).
// NOTE(review): listing is elided — the SDLoc 'dl' definition and the
// NumZero/NumNonZero loop increments are not visible here.
8795 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8796 const X86Subtarget &Subtarget,
8797 SelectionDAG & DAG) {
8799 MVT ResVT = Op.getSimpleValueType();
8800 unsigned NumOperands = Op.getNumOperands();
8802 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8803 "Unexpected number of operands in CONCAT_VECTORS");
8805 // If this node promotes - by concatenating zeroes - the type of the result
8806 // of a node with instruction that zeroes all upper (irrelevant) bits of the
8807 // output register, mark it as legal and catch the pattern in instruction
8808 // selection to avoid emitting extra instructions (for zeroing upper bits).
8809 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
8810 return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
// Classify operands; NonZeros is a per-operand bitmask (uint64_t here,
// unlike the unsigned used in LowerAVXCONCAT_VECTORS).
8812 unsigned NumZero = 0;
8813 unsigned NumNonZero = 0;
8814 uint64_t NonZeros = 0;
8815 for (unsigned i = 0; i != NumOperands; ++i) {
8816 SDValue SubVec = Op.getOperand(i);
8817 if (SubVec.isUndef())
8819 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8822 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8823 NonZeros |= (uint64_t)1 << i;
8829 // If there are zero or one non-zeros we can handle this very simply.
8830 if (NumNonZero <= 1) {
8831 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8832 : DAG.getUNDEF(ResVT);
// Insert the single non-zero operand (if any) at its element offset.
8835 unsigned Idx = countTrailingZeros(NonZeros);
8836 SDValue SubVec = Op.getOperand(Idx);
8837 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8838 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8839 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
// More than two operands: recurse on the two halves.
8842 if (NumOperands > 2) {
8843 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8844 ResVT.getVectorNumElements()/2);
8845 ArrayRef<SDUse> Ops = Op->ops();
8846 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8847 Ops.slice(0, NumOperands/2));
8848 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8849 Ops.slice(NumOperands/2));
8850 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8853 assert(NumNonZero == 2 && "Simple cases not handled?");
8855 if (ResVT.getVectorNumElements() >= 16)
8856 return Op; // The operation is legal with KUNPCK
// Otherwise materialize the two-operand concat as a pair of subvector inserts.
8858 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8859 DAG.getUNDEF(ResVT), Op.getOperand(0),
8860 DAG.getIntPtrConstant(0, dl));
8861 unsigned NumElems = ResVT.getVectorNumElements();
8862 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8863 DAG.getIntPtrConstant(NumElems/2, dl));
// Top-level CONCAT_VECTORS dispatcher: vXi1 masks go to the dedicated path,
// everything else to the AVX subvector-insert path.
8866 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8867 const X86Subtarget &Subtarget,
8868 SelectionDAG &DAG) {
8869 MVT VT = Op.getSimpleValueType();
8870 if (VT.getVectorElementType() == MVT::i1)
8871 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8873 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8874 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8875 Op.getNumOperands() == 4)));
8877 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8878 // from two other 128-bit ones.
8880 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8881 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
8884 //===----------------------------------------------------------------------===//
8885 // Vector shuffle lowering
8887 // This is an experimental code path for lowering vector shuffles on x86. It is
8888 // designed to handle arbitrary vector shuffles and blends, gracefully
8889 // degrading performance as necessary. It works hard to recognize idiomatic
8890 // shuffles and lower them to optimal instruction patterns without leaving
8891 // a framework that allows reasonably efficient handling of all vector shuffle
8893 //===----------------------------------------------------------------------===//
8895 /// Tiny helper function to identify a no-op mask.
8897 /// This is a somewhat boring predicate function. It checks whether the mask
8898 /// array input, which is assumed to be a single-input shuffle mask of the kind
8899 /// used by the X86 shuffle instructions (not a fully general
8900 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8901 /// in-place shuffle are 'no-op's.
// NOTE(review): the 'return' lines of these predicates are elided from this
// listing; comments describe only the visible conditions.
8902 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8903 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8904 assert(Mask[i] >= -1 && "Out of bound mask element!");
8905 if (Mask[i] >= 0 && Mask[i] != i)
8911 /// Test whether there are elements crossing 128-bit lanes in this
8914 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8915 /// and we routinely test for these.
8916 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8917 int LaneSize = 128 / VT.getScalarSizeInBits();
8918 int Size = Mask.size();
8919 for (int i = 0; i < Size; ++i)
// '% Size' folds second-vector indices back into [0, Size) before comparing
// source and destination lane numbers.
8920 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8925 /// Test whether a shuffle mask is equivalent within each sub-lane.
8927 /// This checks a shuffle mask to see if it is performing the same
8928 /// lane-relative shuffle in each sub-lane. This trivially implies
8929 /// that it is also not lane-crossing. It may however involve a blend from the
8930 /// same lane of a second vector.
8932 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8933 /// non-trivial to compute in the face of undef lanes. The representation is
8934 /// suitable for use with existing 128-bit shuffles as entries from the second
8935 /// vector have been remapped to [LaneSize, 2*LaneSize).
8936 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8938 SmallVectorImpl<int> &RepeatedMask) {
8939 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8940 RepeatedMask.assign(LaneSize, -1);
8941 int Size = Mask.size();
8942 for (int i = 0; i < Size; ++i) {
8943 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8946 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8947 // This entry crosses lanes, so there is no way to model this shuffle.
8950 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8951 // Adjust second vector indices to start at LaneSize instead of Size.
8952 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8953 : Mask[i] % LaneSize + LaneSize;
8954 if (RepeatedMask[i % LaneSize] < 0)
8955 // This is the first non-undef entry in this slot of a 128-bit lane.
8956 RepeatedMask[i % LaneSize] = LocalM;
8957 else if (RepeatedMask[i % LaneSize] != LocalM)
8958 // Found a mismatch with the repeated mask.
8964 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8966 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8967 SmallVectorImpl<int> &RepeatedMask) {
8968 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
// Convenience overload that discards the computed repeated mask.
8972 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
8973 SmallVector<int, 32> RepeatedMask;
8974 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8977 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8979 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8980 SmallVectorImpl<int> &RepeatedMask) {
8981 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8984 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8985 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
// NOTE(review): listing is elided (e.g. 'continue'/'return' lines and the
// 'int LocalM =' declaration head are missing); comments describe visible code.
8986 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8988 SmallVectorImpl<int> &RepeatedMask) {
8989 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8990 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8991 int Size = Mask.size();
8992 for (int i = 0; i < Size; ++i) {
8993 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8994 if (Mask[i] == SM_SentinelUndef)
// A zero entry may only land in a slot that is still undef or already zero.
8996 if (Mask[i] == SM_SentinelZero) {
8997 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8999 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9002 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9003 // This entry crosses lanes, so there is no way to model this shuffle.
9006 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9007 // Adjust second vector indices to start at LaneSize instead of Size.
9009 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
9010 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9011 // This is the first non-undef entry in this slot of a 128-bit lane.
9012 RepeatedMask[i % LaneSize] = LocalM;
9013 else if (RepeatedMask[i % LaneSize] != LocalM)
9014 // Found a mismatch with the repeated mask.
9020 /// Checks whether a shuffle mask is equivalent to an explicit list of
9023 /// This is a fast way to test a shuffle mask against a fixed pattern:
9025 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9027 /// It returns true if the mask is exactly as wide as the argument list, and
9028 /// each element of the mask is either -1 (signifying undef) or the value given
9029 /// in the argument.
9030 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
9031 ArrayRef<int> ExpectedMask) {
9032 if (Mask.size() != ExpectedMask.size())
9035 int Size = Mask.size();
9037 // If the values are build vectors, we can look through them to find
9038 // equivalent inputs that make the shuffles equivalent.
9039 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
9040 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
9042 for (int i = 0; i < Size; ++i) {
9043 assert(Mask[i] >= -1 && "Out of bound mask element!");
9044 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
// Indices differ — still equivalent if both refer to build-vector operands
// that are the same SDValue.
9045 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
9046 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
9047 if (!MaskBV || !ExpectedBV ||
9048 MaskBV->getOperand(Mask[i] % Size) !=
9049 ExpectedBV->getOperand(ExpectedMask[i] % Size))
9057 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9059 /// The masks must be exactly the same width.
9061 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9062 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9064 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
9065 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
9066 ArrayRef<int> ExpectedMask) {
9067 int Size = Mask.size();
9068 if (Size != (int)ExpectedMask.size())
9071 for (int i = 0; i < Size; ++i)
9072 if (Mask[i] == SM_SentinelUndef)
9074 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
9076 else if (Mask[i] != ExpectedMask[i])
9082 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
// mask — zeroable positions become SM_SentinelZero, undefs stay undef, and all
// other indices are copied through.
9084 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
9085 const APInt &Zeroable) {
9086 int NumElts = Mask.size();
9087 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
9089 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
9090 for (int i = 0; i != NumElts; ++i) {
9092 if (M == SM_SentinelUndef)
9094 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
9095 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
9100 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instruction patterns (v8i32/v8f32 masks compared against v8i16 unpack masks).
9102 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
9103 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9106 SmallVector<int, 8> Unpcklwd;
9107 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9108 /* Unary = */ false);
9109 SmallVector<int, 8> Unpckhwd;
9110 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9111 /* Unary = */ false);
9112 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
9113 isTargetShuffleEquivalent(Mask, Unpckhwd));
9114 return IsUnpackwdMask;
9117 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9119 /// This helper function produces an 8-bit shuffle immediate corresponding to
9120 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9121 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9124 /// NB: We rely heavily on "undef" masks preserving the input lane.
// NOTE(review): the 'unsigned Imm = 0;' declaration and the final return are
// elided from this listing.
9125 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9126 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9127 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9128 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9129 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9130 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
// Undef lanes (< 0) encode the identity index so the input lane is preserved.
9133 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9134 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9135 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9136 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
// Wrap the 4-lane immediate in an i8 constant node for use as an operand.
9140 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9141 SelectionDAG &DAG) {
9142 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9145 /// Compute whether each element of a shuffle is zeroable.
9147 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
9148 /// Either it is an undef element in the shuffle mask, the element of the input
9149 /// referenced is undef, or the element of the input referenced is known to be
9150 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
9151 /// as many lanes with this technique as possible to simplify the remaining
// NOTE(review): several interior lines (e.g. 'int M = Mask[i];', the
// Zeroable.setBit calls, 'continue' statements, the final return) are elided
// from this listing.
9153 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
9154 SDValue V1, SDValue V2) {
9155 APInt Zeroable(Mask.size(), 0);
9156 V1 = peekThroughBitcasts(V1);
9157 V2 = peekThroughBitcasts(V2);
9159 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
9160 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
9162 int VectorSizeInBits = V1.getValueSizeInBits();
9163 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
9164 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
9166 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9168 // Handle the easy cases.
9169 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
9174 // Determine shuffle input and normalize the mask.
9175 SDValue V = M < Size ? V1 : V2;
9178 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
9179 if (V.getOpcode() != ISD::BUILD_VECTOR)
9182 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
9183 // the (larger) source element must be UNDEF/ZERO.
9184 if ((Size % V.getNumOperands()) == 0) {
9185 int Scale = Size / V->getNumOperands();
9186 SDValue Op = V.getOperand(M / Scale);
9187 if (Op.isUndef() || X86::isZeroNode(Op))
9189 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
// Extract the bitcasted sub-element's bits and test them for zero.
9190 APInt Val = Cst->getAPIntValue();
9191 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9192 Val = Val.getLoBits(ScalarSizeInBits);
9195 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
9196 APInt Val = Cst->getValueAPF().bitcastToAPInt();
9197 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9198 Val = Val.getLoBits(ScalarSizeInBits);
9205 // If the BUILD_VECTOR has more elements then all the (smaller) source
9206 // elements must be UNDEF or ZERO.
9207 if ((V.getNumOperands() % Size) == 0) {
9208 int Scale = V->getNumOperands() / Size;
9209 bool AllZeroable = true;
9210 for (int j = 0; j < Scale; ++j) {
9211 SDValue Op = V.getOperand((M * Scale) + j);
9212 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
9223 // The Shuffle result is as follow:
9224 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
9225 // Each Zeroable's element correspond to a particular Mask's element.
9226 // As described in computeZeroableShuffleElements function.
9228 // The function looks for a sub-mask that the nonzero elements are in
9229 // increasing order. If such sub-mask exist. The function returns true.
9230 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9231 ArrayRef<int> Mask, const EVT &VectorType,
9232 bool &IsZeroSideLeft) {
9233 int NextElement = -1;
9234 // Check if the Mask's nonzero elements are in increasing order.
9235 for (int i = 0, e = Mask.size(); i < e; i++) {
9236 // Checks if the mask's zeros elements are built from only zeros.
9237 assert(Mask[i] >= -1 && "Out of bound mask element!");
9242 // Find the lowest non zero element
9243 if (NextElement < 0) {
9244 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9245 IsZeroSideLeft = NextElement != 0;
9247 // Exit if the mask's non zero elements are not in increasing order.
9248 if (NextElement != Mask[i])
9255 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
// NOTE(review): listing is elided — e.g. the 'SDValue V' accumulator setup,
// the undef/zero 'continue's, and the single-source check between SrcV and V
// are not fully visible; comments describe only what is shown.
9256 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9257 ArrayRef<int> Mask, SDValue V1,
9259 const APInt &Zeroable,
9260 const X86Subtarget &Subtarget,
9261 SelectionDAG &DAG) {
9262 int Size = Mask.size();
9263 int LaneSize = 128 / VT.getScalarSizeInBits();
9264 const int NumBytes = VT.getSizeInBits() / 8;
9265 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9267 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9268 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9269 (Subtarget.hasBWI() && VT.is512BitVector()));
9271 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9272 // Sign bit set in i8 mask means zero element.
9273 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
// Build the byte-level control mask, widening each element index to its
// constituent bytes.
9276 for (int i = 0; i < NumBytes; ++i) {
9277 int M = Mask[i / NumEltBytes];
9279 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9282 if (Zeroable[i / NumEltBytes]) {
9283 PSHUFBMask[i] = ZeroMask;
9287 // We can only use a single input of V1 or V2.
9288 SDValue SrcV = (M >= Size ? V2 : V1);
9294 // PSHUFB can't cross lanes, ensure this doesn't happen.
9295 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9299 M = M * NumEltBytes + (i % NumEltBytes);
9300 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9302 assert(V && "Failed to find a source input");
9304 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9305 return DAG.getBitcast(
9306 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9307 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
// Forward declaration; defined later in this file.
9310 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9311 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9314 // X86 has dedicated shuffle that can be lowered to VEXPAND
9315 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
9316 const APInt &Zeroable,
9317 ArrayRef<int> Mask, SDValue &V1,
9318 SDValue &V2, SelectionDAG &DAG,
9319 const X86Subtarget &Subtarget) {
9320 bool IsLeftZeroSide = true;
9321 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
// The VEXPAND write-mask is the complement of the zeroable elements.
9324 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9326 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9327 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9328 unsigned NumElts = VT.getVectorNumElements();
9329 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9330 "Unexpected number of vector elements");
9331 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9332 Subtarget, DAG, DL);
9333 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9334 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9335 return DAG.getSelect(DL, VT, VMask,
9336 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
// Match a target shuffle mask against the UNPCKL/UNPCKH patterns, updating
// V1/V2 and UnpackOpcode on success. NOTE(review): 'return true'/'return
// false' lines and some commuted-case V1/V2 swaps are elided from this
// listing; comments describe only the visible logic.
9340 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9341 unsigned &UnpackOpcode, bool IsUnary,
9342 ArrayRef<int> TargetMask,
9343 const SDLoc &DL, SelectionDAG &DAG,
9344 const X86Subtarget &Subtarget) {
9345 int NumElts = VT.getVectorNumElements();
// Determine whether the even (input 1) / odd (input 2) mask positions are
// entirely undef or zeroable.
9347 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9348 for (int i = 0; i != NumElts; i += 2) {
9349 int M1 = TargetMask[i + 0];
9350 int M2 = TargetMask[i + 1];
9351 Undef1 &= (SM_SentinelUndef == M1);
9352 Undef2 &= (SM_SentinelUndef == M2);
9353 Zero1 &= isUndefOrZero(M1);
9354 Zero2 &= isUndefOrZero(M2);
9356 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9357 "Zeroable shuffle detected");
9359 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9360 SmallVector<int, 64> Unpckl, Unpckh;
9361 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9362 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9363 UnpackOpcode = X86ISD::UNPCKL;
9364 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9365 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9369 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9370 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9371 UnpackOpcode = X86ISD::UNPCKH;
9372 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9373 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9377 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9378 if (IsUnary && (Zero1 || Zero2)) {
9379 // Don't bother if we can blend instead.
9380 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9381 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9384 bool MatchLo = true, MatchHi = true;
9385 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9386 int M = TargetMask[i];
9388 // Ignore if the input is known to be zero or the index is undef.
9389 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9390 (M == SM_SentinelUndef))
9393 MatchLo &= (M == Unpckl[i]);
9394 MatchHi &= (M == Unpckh[i]);
9397 if (MatchLo || MatchHi) {
9398 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9399 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9400 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9405 // If a binary shuffle, commute and try again.
9407 ShuffleVectorSDNode::commuteMask(Unpckl);
9408 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9409 UnpackOpcode = X86ISD::UNPCKL;
9414 ShuffleVectorSDNode::commuteMask(Unpckh);
9415 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9416 UnpackOpcode = X86ISD::UNPCKH;
9425 // X86 has dedicated unpack instructions that can handle specific blend
9426 // operations: UNPCKH and UNPCKL.
9427 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9428 ArrayRef<int> Mask, SDValue V1,
9429 SDValue V2, SelectionDAG &DAG) {
9430 SmallVector<int, 8> Unpckl;
9431 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9432 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9433 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9435 SmallVector<int, 8> Unpckh;
9436 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9437 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9438 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9440 // Commute and try again.
9441 ShuffleVectorSDNode::commuteMask(Unpckl);
9442 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9443 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9445 ShuffleVectorSDNode::commuteMask(Unpckh);
9446 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9447 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
// Check whether Mask is the mask of a truncate-by-Delta shuffle: the first
// Size/Delta entries select every Delta-th element of the truncated input,
// and the rest do not reference it. NOTE(review): the 'int Delta' parameter
// line and the return statements are elided from this listing.
9452 static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
9454 int Size = (int)Mask.size();
9455 int Split = Size / Delta;
9456 int TruncatedVectorStart = SwappedOps ? Size : 0;
9458 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
9459 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
9462 // The rest of the mask should not refer to the truncated vector's elements.
9463 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
9464 TruncatedVectorStart + Size))
9470 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9472 // An example is the following:
9474 // t0: ch = EntryToken
9475 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9476 // t25: v4i32 = truncate t2
9477 // t41: v8i16 = bitcast t25
9478 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9479 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9480 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9481 // t18: v2i64 = bitcast t51
9483 // Without avx512vl, this is lowered to:
9485 // vpmovqd %zmm0, %ymm0
9486 // vpshufb {{.*#+}} xmm0 =
9487 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
9489 // But when avx512vl is available, one can just use a single vpmovdw
// NOTE(review): listing is elided — the early 'return SDValue()' lines and
// the operand-swap bookkeeping inside the zero-vector check are not visible.
9491 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
9492 MVT VT, SDValue V1, SDValue V2,
9494 const X86Subtarget &Subtarget) {
9495 if (VT != MVT::v16i8 && VT != MVT::v8i16)
9498 if (Mask.size() != VT.getVectorNumElements())
// One of the two inputs must be an all-zeros build vector (the zero tail of
// the truncation pattern shown above).
9501 bool SwappedOps = false;
9503 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
9504 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
9513 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
9514 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
9516 // and similar ones.
9517 if (V1.getOpcode() != ISD::BITCAST)
9519 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
9522 SDValue Src = V1.getOperand(0).getOperand(0);
9523 MVT SrcVT = Src.getSimpleValueType();
9525 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
9526 // are only available with avx512vl.
9527 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
9530 // Down Convert Word to Byte is only available with avx512bw. The case with
9531 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
9532 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
9533 !Subtarget.hasBWI())
9536 // The first half/quarter of the mask should refer to every second/fourth
9537 // element of the vector truncated and bitcasted.
9538 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
9539 !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
9542 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
9545 // X86 has dedicated pack instructions that can handle specific truncation
9546 // operations: PACKSS and PACKUS.
// NOTE(review): listing is elided — MatchPACK's SrcVT/V1/V2 assignments and
// the return statements are not visible here.
9547 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9548 SDValue &V2, unsigned &PackOpcode,
9549 ArrayRef<int> TargetMask,
9551 const X86Subtarget &Subtarget) {
9552 unsigned NumElts = VT.getVectorNumElements();
9553 unsigned BitSize = VT.getScalarSizeInBits();
// The pack source type has double-width scalars and half the element count.
9554 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9555 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9557 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9558 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9559 SDValue VV2 = DAG.getBitcast(PackVT, N2);
// PACKUS is usable when the upper half of every source element is known zero.
9560 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9561 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9562 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9563 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9567 PackOpcode = X86ISD::PACKUS;
// PACKSS is usable when each source element is a sign-extension of the
// narrow value (enough known sign bits).
9571 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9572 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9576 PackOpcode = X86ISD::PACKSS;
9582 // Try binary shuffle.
9583 SmallVector<int, 32> BinaryMask;
9584 createPackShuffleMask(VT, BinaryMask, false);
9585 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9586 if (MatchPACK(V1, V2))
9589 // Try unary shuffle.
9590 SmallVector<int, 32> UnaryMask;
9591 createPackShuffleMask(VT, UnaryMask, true);
9592 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9593 if (MatchPACK(V1, V1))
// Emit a PACKSS/PACKUS node if the mask matches a pack pattern.
9599 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9600 ArrayRef<int> Mask, SDValue V1,
9601 SDValue V2, SelectionDAG &DAG,
9602 const X86Subtarget &Subtarget) {
9604 unsigned PackOpcode;
9605 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9607 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9608 DAG.getBitcast(PackVT, V2));
9613 /// Try to emit a bitmask instruction for a shuffle.
9615 /// This handles cases where we can model a blend exactly as a bitmask due to
9616 /// one of the inputs being zeroable.
// NOTE(review): listing is elided — the 'SDValue V' declaration, the
// zeroable 'continue', and the V-null check before the final AND are missing
// from view.
9617 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9618 SDValue V2, ArrayRef<int> Mask,
9619 const APInt &Zeroable,
9620 SelectionDAG &DAG) {
9621 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9622 MVT EltVT = VT.getVectorElementType();
9623 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9624 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9625 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9627 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9630 if (Mask[i] % Size != i)
9631 return SDValue(); // Not a blend.
9633 V = Mask[i] < Size ? V1 : V2;
9634 else if (V != (Mask[i] < Size ? V1 : V2))
9635 return SDValue(); // Can only let one input through the mask.
9637 VMaskOps[i] = AllOnes;
9640 return SDValue(); // No non-zeroable elements!
// AND the surviving input with the element-wise all-ones/zero mask.
9642 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9643 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9646 /// Try to emit a blend instruction for a shuffle using bit math.
9648 /// This is used as a fallback approach when first class blend instructions are
9649 /// unavailable. Currently it is only suitable for integer vectors, but could
9650 /// be generalized for floating point vectors if desirable.
9651 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9652 SDValue V2, ArrayRef<int> Mask,
9653 SelectionDAG &DAG) {
9654 assert(VT.isInteger() && "Only supports integer vector types!");
9655 MVT EltVT = VT.getVectorElementType();
9656 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9657 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9658 SmallVector<SDValue, 16> MaskOps;
9659 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9660 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9661 return SDValue(); // Shuffled input!
9662 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
// Result = (V1 & M) | (V2 & ~M), with the ANDNP done in an i64 vector type.
9665 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9666 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9667 // We have to cast V2 around.
9668 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9669 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9670 DAG.getBitcast(MaskVT, V1Mask),
9671 DAG.getBitcast(MaskVT, V2)));
9672 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9675 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9676 SDValue PreservedSrc,
9677 const X86Subtarget &Subtarget,
// Match a shuffle mask against a two-input element blend, accumulating the
// blend immediate in BlendMask (bit i set means lane i is taken from V2).
// Zeroable lanes are folded into an input that is already all-zero/undef; the
// Force*Zero out-flags tell the caller to materialize a real zero vector
// (elided lines presumably set them — confirm in full file).
9680 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9681 MutableArrayRef<int> TargetMask,
9682 bool &ForceV1Zero, bool &ForceV2Zero,
9683 uint64_t &BlendMask) {
9684 bool V1IsZeroOrUndef =
9685 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9686 bool V2IsZeroOrUndef =
9687 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9690 ForceV1Zero = false, ForceV2Zero = false;
// The blend immediate is a uint64_t: one bit per mask element.
9691 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9693 // Attempt to generate the binary blend mask. If an input is zero then
9694 // we can use any lane.
9695 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9696 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9697 int M = TargetMask[i];
9698 if (M == SM_SentinelUndef)
9702 if (M == i + Size) {
// Lane i comes from V2 in place.
9703 BlendMask |= 1ull << i;
9706 if (M == SM_SentinelZero) {
9707 if (V1IsZeroOrUndef) {
9712 if (V2IsZeroOrUndef) {
// Take the zeroable lane from (already-zero) V2 and canonicalize the mask
// entry so the caller sees a plain two-input blend.
9714 BlendMask |= 1ull << i;
9715 TargetMask[i] = i + Size;
// NOTE(review): several flow-control lines (continues, braces, the final
// return, and the failure path) are elided from this listing.
// Widen a per-element blend immediate: each set bit of BlendMask becomes
// Scale consecutive set bits in the result, for use when the blend is
// performed on a different element granularity than the original mask.
9724 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
// NOTE(review): the second signature line (declaring the 'Scale' parameter)
// and the trailing 'return ScaledMask;' are elided from this listing.
9726 uint64_t ScaledMask = 0;
9727 for (int i = 0; i != Size; ++i)
9728 if (BlendMask & (1ull << i))
9729 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9733 /// Try to emit a blend instruction for a shuffle.
9735 /// This doesn't do any checks for the availability of instructions for blending
9736 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9737 /// be matched in the backend with the type given. What it does check for is
9738 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9739 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9740 SDValue V2, ArrayRef<int> Original,
9741 const APInt &Zeroable,
9742 const X86Subtarget &Subtarget,
9743 SelectionDAG &DAG) {
// Rewrite zeroable lanes of the mask into explicit sentinels up front.
9744 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9746 uint64_t BlendMask = 0;
9747 bool ForceV1Zero = false, ForceV2Zero = false;
9748 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9752 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9754 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9756 V2 = getZeroVector(VT, Subtarget, DAG, DL);
// Dispatch on the vector type to pick the best blend instruction.
// NOTE(review): the 'case MVT::...' labels are elided from this listing, so
// the type each arm handles must be inferred from its asserts/comments.
9758 switch (VT.SimpleTy) {
// Types with a native BLENDI pattern: emit it directly.
9763 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9764 DAG.getConstant(BlendMask, DL, MVT::i8));
9768 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9772 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9773 // that instruction.
9774 if (Subtarget.hasAVX2()) {
9775 // Scale the blend by the number of 32-bit dwords per element.
9776 int Scale = VT.getScalarSizeInBits() / 32;
9777 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9778 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9779 V1 = DAG.getBitcast(BlendVT, V1);
9780 V2 = DAG.getBitcast(BlendVT, V2);
9781 return DAG.getBitcast(
9782 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9783 DAG.getConstant(BlendMask, DL, MVT::i8)));
9787 // For integer shuffles we need to expand the mask and cast the inputs to
9788 // v8i16s prior to blending.
9789 int Scale = 8 / VT.getVectorNumElements();
9790 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9791 V1 = DAG.getBitcast(MVT::v8i16, V1);
9792 V2 = DAG.getBitcast(MVT::v8i16, V2);
9793 return DAG.getBitcast(VT,
9794 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9795 DAG.getConstant(BlendMask, DL, MVT::i8)));
9799 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9800 SmallVector<int, 8> RepeatedMask;
9801 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9802 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9803 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
// Rebuild the 8-bit immediate from the repeated per-lane mask.
9805 for (int i = 0; i < 8; ++i)
9806 if (RepeatedMask[i] >= 8)
9807 BlendMask |= 1ull << i;
9808 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9809 DAG.getConstant(BlendMask, DL, MVT::i8));
9815 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9816 "256-bit byte-blends require AVX2 support!");
// AVX-512 (BWI+VLX): express the blend as a masked select on V2 over V1.
9818 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9820 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9821 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9822 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9825 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9826 if (SDValue Masked =
9827 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9830 // Scale the blend by the number of bytes per element.
9831 int Scale = VT.getScalarSizeInBits() / 8;
9833 // This form of blend is always done on bytes. Compute the byte vector
9835 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9837 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9838 // mix of LLVM's code generator and the x86 backend. We tell the code
9839 // generator that boolean values in the elements of an x86 vector register
9840 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9841 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9842 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9843 // of the element (the remaining are ignored) and 0 in that high bit would
9844 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9845 // the LLVM model for boolean values in vector elements gets the relevant
9846 // bit set, it is set backwards and over constrained relative to x86's
9848 SmallVector<SDValue, 32> VSELECTMask;
9849 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9850 for (int j = 0; j < Scale; ++j)
9851 VSELECTMask.push_back(
9852 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9853 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9856 V1 = DAG.getBitcast(BlendVT, V1);
9857 V2 = DAG.getBitcast(BlendVT, V2);
9858 return DAG.getBitcast(
9860 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
// Final arm (presumably the 512-bit types): same masked-select strategy as
// the BWI+VLX path above.
9870 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9871 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9872 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9875 llvm_unreachable("Not a supported integer vector type!");
9879 /// Try to lower as a blend of elements from two inputs followed by
9880 /// a single-input permutation.
9882 /// This matches the pattern where we can blend elements from two inputs and
9883 /// then reduce the shuffle to a single-input permutation.
9884 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9885 SDValue V1, SDValue V2,
9887 SelectionDAG &DAG) {
9888 // We build up the blend mask while checking whether a blend is a viable way
9889 // to reduce the shuffle.
// BlendMask keeps every needed element in its source position (a pure
// blend); PermuteMask then moves it to its final position.
9890 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9891 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9893 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9897 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
// Each source slot (Mask[i] % Size) can only hold one element; a conflict
// means both inputs are needed in the same slot and the blend fails.
9899 if (BlendMask[Mask[i] % Size] < 0)
9900 BlendMask[Mask[i] % Size] = Mask[i];
9901 else if (BlendMask[Mask[i] % Size] != Mask[i])
9902 return SDValue(); // Can't blend in the needed input!
9904 PermuteMask[i] = Mask[i] % Size;
// Emit: blend the two inputs, then permute the blended vector.
9907 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9908 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9911 /// Generic routine to decompose a shuffle and blend into independent
9912 /// blends and permutes.
9914 /// This matches the extremely common pattern for handling combined
9915 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9916 /// operations. It will try to pick the best arrangement of shuffles and
9918 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9922 SelectionDAG &DAG) {
9923 // Shuffle the input elements into the desired positions in V1 and V2 and
9924 // blend them together.
9925 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9926 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9927 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9928 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9929 if (Mask[i] >= 0 && Mask[i] < Size) {
9930 V1Mask[i] = Mask[i];
// NOTE(review): the line setting BlendMask[i] for the V1 case is elided
// from this listing (presumably 'BlendMask[i] = i;').
9932 } else if (Mask[i] >= Size) {
9933 V2Mask[i] = Mask[i] - Size;
9934 BlendMask[i] = i + Size;
9937 // Try to lower with the simpler initial blend strategy unless one of the
9938 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9939 // shuffle may be able to fold with a load or other benefit. However, when
9940 // we'll have to do 2x as many shuffles in order to achieve this, blending
9941 // first is a better strategy.
9942 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9943 if (SDValue BlendPerm =
9944 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
// Otherwise: pre-shuffle each input into place, then blend lane-by-lane.
9947 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9948 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9949 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9952 /// Try to lower a vector shuffle as a rotation.
9954 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
// Returns the element rotation amount on success; V1/V2 are normalized to
// the Lo/Hi inputs of the rotation. (The declarations of Rotation/Lo/Hi and
// the final normalization/return lines are elided from this listing.)
9955 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9956 ArrayRef<int> Mask) {
9957 int NumElts = Mask.size();
9959 // We need to detect various ways of spelling a rotation:
9960 // [11, 12, 13, 14, 15, 0, 1, 2]
9961 // [-1, 12, 13, 14, -1, -1, 1, -1]
9962 // [-1, -1, -1, -1, -1, -1, 1, 2]
9963 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9964 // [-1, 4, 5, 6, -1, -1, 9, -1]
9965 // [-1, 4, 5, 6, -1, -1, -1, -1]
9968 for (int i = 0; i < NumElts; ++i) {
9970 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9971 "Unexpected mask index.");
9975 // Determine where a rotated vector would have started.
9976 int StartIdx = i - (M % NumElts);
9978 // The identity rotation isn't interesting, stop.
9981 // If we found the tail of a vector the rotation must be the missing
9982 // front. If we found the head of a vector, it must be how much of the
9984 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
// Every defined mask element must imply the same rotation amount.
9987 Rotation = CandidateRotation;
9988 else if (Rotation != CandidateRotation)
9989 // The rotations don't match, so we can't match this mask.
9992 // Compute which value this mask is pointing at.
9993 SDValue MaskV = M < NumElts ? V1 : V2;
9995 // Compute which of the two target values this index should be assigned
9996 // to. This reflects whether the high elements are remaining or the low
9997 // elements are remaining.
9998 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
10000 // Either set up this value if we've not encountered it before, or check
10001 // that it remains consistent.
10004 else if (TargetV != MaskV)
10005 // This may be a rotation, but it pulls from the inputs in some
10006 // unsupported interleaving.
10010 // Check that we successfully analyzed the mask, and normalize the results.
10011 assert(Rotation != 0 && "Failed to locate a viable rotation!");
10012 assert((Lo || Hi) && "Failed to find a rotated input vector!");
10024 /// Try to lower a vector shuffle as a byte rotation.
10026 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
10027 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
10028 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
10029 /// try to generically lower a vector shuffle through such an pattern. It
10030 /// does not check for the profitability of lowering either as PALIGNR or
10031 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
10032 /// This matches shuffle vectors that look like:
10034 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
10036 /// Essentially it concatenates V1 and V2, shifts right by some number of
10037 /// elements, and takes the low elements as the result. Note that while this is
10038 /// specified as a *right shift* because x86 is little-endian, it is a *left
10039 /// rotate* of the vector lanes.
// Returns the byte rotation amount, or a non-positive value on failure.
10040 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
10041 ArrayRef<int> Mask) {
10042 // Don't accept any shuffles with zero elements.
10043 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
10046 // PALIGNR works on 128-bit lanes.
10047 SmallVector<int, 16> RepeatedMask;
10048 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
// Delegate the per-lane rotation detection to the element-level matcher.
10051 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
10055 // PALIGNR rotates bytes, so we need to scale the
10056 // rotation based on how many bytes are in the vector lane.
10057 int NumElts = RepeatedMask.size();
10058 int Scale = 16 / NumElts;
10059 return Rotation * Scale;
// Lower a shuffle matched as a byte rotation, using PALIGNR on SSSE3+ or a
// PSLLDQ/PSRLDQ/POR sequence on plain SSE2.
10062 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
10063 SDValue V1, SDValue V2,
10064 ArrayRef<int> Mask,
10065 const X86Subtarget &Subtarget,
10066 SelectionDAG &DAG) {
10067 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
10069 SDValue Lo = V1, Hi = V2;
10070 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
10071 if (ByteRotation <= 0)
10074 // Cast the inputs to i8 vector of correct length to match PALIGNR or
// the SSE2 byte-shift fallback below.
10076 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10077 Lo = DAG.getBitcast(ByteVT, Lo);
10078 Hi = DAG.getBitcast(ByteVT, Hi);
10080 // SSSE3 targets can use the palignr instruction.
10081 if (Subtarget.hasSSSE3()) {
10082 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
10083 "512-bit PALIGNR requires BWI instructions");
10084 return DAG.getBitcast(
10085 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
10086 DAG.getConstant(ByteRotation, DL, MVT::i8)));
10089 assert(VT.is128BitVector() &&
10090 "Rotate-based lowering only supports 128-bit lowering!");
10091 assert(Mask.size() <= 16 &&
10092 "Can shuffle at most 16 bytes in a 128-bit vector!");
10093 assert(ByteVT == MVT::v16i8 &&
10094 "SSE2 rotate lowering only needed for v16i8!");
10096 // Default SSE2 implementation
// Emulate the rotation as (Lo << (16 - r)) | (Hi >> r) over whole bytes.
10097 int LoByteShift = 16 - ByteRotation;
10098 int HiByteShift = ByteRotation;
10100 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
10101 DAG.getConstant(LoByteShift, DL, MVT::i8));
10102 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
10103 DAG.getConstant(HiByteShift, DL, MVT::i8));
10104 return DAG.getBitcast(VT,
10105 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
10108 /// Try to lower a vector shuffle as a dword/qword rotation.
10110 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
10111 /// rotation of the concatenation of two vectors; This routine will
10112 /// try to generically lower a vector shuffle through such an pattern.
10114 /// Essentially it concatenates V1 and V2, shifts right by some number of
10115 /// elements, and takes the low elements as the result. Note that while this is
10116 /// specified as a *right shift* because x86 is little-endian, it is a *left
10117 /// rotate* of the vector lanes.
10118 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
10119 SDValue V1, SDValue V2,
10120 ArrayRef<int> Mask,
10121 const X86Subtarget &Subtarget,
10122 SelectionDAG &DAG) {
10123 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
10124 "Only 32-bit and 64-bit elements are supported!");
10126 // 128/256-bit vectors are only supported with VLX.
10127 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
10128 && "VLX required for 128/256-bit vectors");
10130 SDValue Lo = V1, Hi = V2;
10131 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
// NOTE(review): the failure check on Rotation is elided from this listing;
// on success emit VALIGN with the element rotation as immediate.
10135 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
10136 DAG.getConstant(Rotation, DL, MVT::i8));
10139 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
10141 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
10142 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
10143 /// matches elements from one of the input vectors shuffled to the left or
10144 /// right with zeroable elements 'shifted in'. It handles both the strictly
10145 /// bit-wise element shifts and the byte shift across an entire 128-bit double
10146 /// quad word lane.
10148 /// PSHL : (little-endian) left bit shift.
10149 /// [ zz, 0, zz, 2 ]
10150 /// [ -1, 4, zz, -1 ]
10151 /// PSRL : (little-endian) right bit shift.
10152 /// [ 1, zz, 3, zz]
10153 /// [ -1, -1, 7, zz]
10154 /// PSLLDQ : (little-endian) left byte shift
10155 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
10156 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
10157 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
10158 /// PSRLDQ : (little-endian) right byte shift
10159 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
10160 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
10161 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
// On success sets ShiftVT/Opcode and returns the shift amount; MaskOffset
// selects which input's lane indices to match (0 for V1, Size for V2).
10162 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
10163 unsigned ScalarSizeInBits,
10164 ArrayRef<int> Mask, int MaskOffset,
10165 const APInt &Zeroable,
10166 const X86Subtarget &Subtarget) {
10167 int Size = Mask.size();
10168 unsigned SizeInBits = Size * ScalarSizeInBits;
// The lanes a shift would vacate must all be provably zeroable.
10170 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
10171 for (int i = 0; i < Size; i += Scale)
10172 for (int j = 0; j < Shift; ++j)
10173 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
// The surviving lanes must be a sequential run from the selected input.
10179 auto MatchShift = [&](int Shift, int Scale, bool Left) {
10180 for (int i = 0; i != Size; i += Scale) {
10181 unsigned Pos = Left ? i + Shift : i;
10182 unsigned Low = Left ? i : i + Shift;
10183 unsigned Len = Scale - Shift;
10184 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
// Shifts wider than 64 bits must use the whole-lane byte shifts
// (PSLLDQ/PSRLDQ); otherwise use the per-element bit shifts.
10188 int ShiftEltBits = ScalarSizeInBits * Scale;
10189 bool ByteShift = ShiftEltBits > 64;
10190 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
10191 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
10192 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
10194 // Normalize the scale for byte shifts to still produce an i64 element
10196 Scale = ByteShift ? Scale / 2 : Scale;
10198 // We need to round trip through the appropriate type for the shift.
10199 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10200 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10201 : MVT::getVectorVT(ShiftSVT, Size / Scale);
10202 return (int)ShiftAmt;
10205 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10206 // keep doubling the size of the integer elements up to that. We can
10207 // then shift the elements of the integer vector by whole multiples of
10208 // their width within the elements of the larger integer vector. Test each
10209 // multiple to see if we can find a match with the moved element indices
10210 // and that the shifted in elements are all zeroable.
10211 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
10212 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
10213 for (int Shift = 1; Shift != Scale; ++Shift)
10214 for (bool Left : {true, false})
10215 if (CheckZeros(Shift, Scale, Left)) {
10216 int ShiftAmt = MatchShift(Shift, Scale, Left);
// NOTE(review): the success/failure returns at the end of this function are
// elided from this listing.
// Lower a shuffle as a logical bit/byte shift of one input, trying V1 first
// and then V2 (matchVectorShuffleAsShift does the actual pattern matching).
10225 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10226 SDValue V2, ArrayRef<int> Mask,
10227 const APInt &Zeroable,
10228 const X86Subtarget &Subtarget,
10229 SelectionDAG &DAG) {
10230 int Size = Mask.size();
10231 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10237 // Try to match shuffle against V1 shift.
10238 int ShiftAmt = matchVectorShuffleAsShift(
10239 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
10241 // If V1 failed, try to match shuffle against V2 shift.
10242 if (ShiftAmt < 0) {
10244 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
10245 Mask, Size, Zeroable, Subtarget);
// NOTE(review): the selection of V (the matched input) and the final
// failure check are elided from this listing.
10252 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
10253 "Illegal integer vector type");
// Round-trip through the shift's element type, emit the shift, cast back.
10254 V = DAG.getBitcast(ShiftVT, V);
10255 V = DAG.getNode(Opcode, DL, ShiftVT, V,
10256 DAG.getConstant(ShiftAmt, DL, MVT::i8));
10257 return DAG.getBitcast(VT, V);
10260 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
10261 // Remainder of lower half result is zero and upper half is all undef.
// On success fills BitLen/BitIdx (both in bits, masked to 6 bits as the
// EXTRQ immediate requires) and returns true.
10262 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
10263 ArrayRef<int> Mask, uint64_t &BitLen,
10264 uint64_t &BitIdx, const APInt &Zeroable) {
10265 int Size = Mask.size();
10266 int HalfSize = Size / 2;
10267 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10268 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
10270 // Upper half must be undefined.
10271 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10274 // Determine the extraction length from the part of the
10275 // lower half that isn't zeroable.
10276 int Len = HalfSize;
10277 for (; Len > 0; --Len)
10278 if (!Zeroable[Len - 1])
10280 assert(Len > 0 && "Zeroable shuffle mask");
10282 // Attempt to match first Len sequential elements from the lower half.
10285 for (int i = 0; i != Len; ++i) {
10287 if (M == SM_SentinelUndef)
10289 SDValue &V = (M < Size ? V1 : V2);
10292 // The extracted elements must start at a valid index and all mask
10293 // elements must be in the lower half.
10294 if (i > M || M >= HalfSize)
// All defined elements must agree on a single source and start index.
10297 if (Idx < 0 || (Src == V && Idx == (M - i))) {
10305 if (!Src || Idx < 0)
10308 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
10309 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10310 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10315 // INSERTQ: Extract lowest Len elements from lower half of second source and
10316 // insert over first source, starting at Idx.
10317 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
// On success fills BitLen/BitIdx (in bits, masked to the 6-bit immediate
// range) and returns true.
10318 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
10319 ArrayRef<int> Mask, uint64_t &BitLen,
10320 uint64_t &BitIdx) {
10321 int Size = Mask.size();
10322 int HalfSize = Size / 2;
10323 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10325 // Upper half must be undefined.
10326 if (!isUndefInRange(Mask, HalfSize, HalfSize))
// Try every insertion point in the lower half.
10329 for (int Idx = 0; Idx != HalfSize; ++Idx) {
10332 // Attempt to match first source from mask before insertion point.
10333 if (isUndefInRange(Mask, 0, Idx)) {
10335 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
10337 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
10343 // Extend the extraction length looking to match both the insertion of
10344 // the second source and the remaining elements of the first.
10345 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
10347 int Len = Hi - Idx;
10349 // Match insertion.
10350 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
10352 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
10358 // Match the remaining elements of the lower half.
10359 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
10361 } else if ((!Base || (Base == V1)) &&
10362 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
10364 } else if ((!Base || (Base == V2)) &&
10365 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
// NOTE(review): the assignments of Base/Insert in each branch above are
// elided from this listing.
10372 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10373 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10383 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
// Tries the EXTRQ pattern first, then INSERTQ; both matchers may null out an
// unused input, hence the UNDEF substitution below.
10384 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
10385 SDValue V2, ArrayRef<int> Mask,
10386 const APInt &Zeroable,
10387 SelectionDAG &DAG) {
10388 uint64_t BitLen, BitIdx;
10389 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
10390 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
10391 DAG.getConstant(BitLen, DL, MVT::i8),
10392 DAG.getConstant(BitIdx, DL, MVT::i8))
10394 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
10395 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
10396 V2 ? V2 : DAG.getUNDEF(VT),
10397 DAG.getConstant(BitLen, DL, MVT::i8),
10398 DAG.getConstant(BitIdx, DL, MVT::i8));
// NOTE(review): the fallthrough 'return SDValue();' is elided from this
// listing.
10403 /// Lower a vector shuffle as a zero or any extension.
10405 /// Given a specific number of elements, element bit width, and extension
10406 /// stride, produce either a zero or any extension based on the available
10407 /// features of the subtarget. The extended elements are consecutive and
10408 /// begin and can start from an offsetted element index in the input; to
10409 /// avoid excess shuffling the offset must either being in the bottom lane
10410 /// or at the start of a higher lane. All extended elements must be from
10412 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10413 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
10414 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10415 assert(Scale > 1 && "Need a scale to extend.");
10416 int EltBits = VT.getScalarSizeInBits();
10417 int NumElements = VT.getVectorNumElements();
10418 int NumEltsPerLane = 128 / EltBits;
10419 int OffsetLane = Offset / NumEltsPerLane;
10420 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
10421 "Only 8, 16, and 32 bit elements can be extended.");
10422 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
10423 assert(0 <= Offset && "Extension offset must be positive.");
10424 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
10425 "Extension offset must be in the first lane or start an upper lane.");
10427 // Check that an index is in same lane as the base offset.
10428 auto SafeOffset = [&](int Idx) {
10429 return OffsetLane == (Idx / NumEltsPerLane);
10432 // Shift along an input so that the offset base moves to the first element.
10433 auto ShuffleOffset = [&](SDValue V) {
10437 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10438 for (int i = 0; i * Scale < NumElements; ++i) {
10439 int SrcIdx = i + Offset;
10440 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10442 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10445 // Found a valid zext mask! Try various lowering strategies based on the
10446 // input type and available ISA extensions.
// Preferred path: SSE4.1 PMOVZX via the X86ISD::VZEXT node.
10447 if (Subtarget.hasSSE41()) {
10448 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10449 // PUNPCK will catch this in a later shuffle match.
10450 if (Offset && Scale == 2 && VT.is128BitVector())
10452 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10453 NumElements / Scale);
10454 InputV = ShuffleOffset(InputV);
10455 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10456 return DAG.getBitcast(VT, InputV);
10459 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10461 // For any extends we can cheat for larger element sizes and use shuffle
10462 // instructions that can fold with a load and/or copy.
10463 if (AnyExt && EltBits == 32) {
10464 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10466 return DAG.getBitcast(
10467 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10468 DAG.getBitcast(MVT::v4i32, InputV),
10469 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10471 if (AnyExt && EltBits == 16 && Scale > 2) {
10472 int PSHUFDMask[4] = {Offset / 2, -1,
10473 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10474 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10475 DAG.getBitcast(MVT::v4i32, InputV),
10476 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10477 int PSHUFWMask[4] = {1, -1, -1, -1};
10478 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10479 return DAG.getBitcast(
10480 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10481 DAG.getBitcast(MVT::v8i16, InputV),
10482 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10485 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// of a 64-bit zero-extension via EXTRQI, unpacking Lo/Hi halves if needed.
10487 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10488 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10489 assert(VT.is128BitVector() && "Unexpected vector width!");
10491 int LoIdx = Offset * EltBits;
10492 SDValue Lo = DAG.getBitcast(
10493 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10494 DAG.getConstant(EltBits, DL, MVT::i8),
10495 DAG.getConstant(LoIdx, DL, MVT::i8)));
10497 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10498 !SafeOffset(Offset + 1))
10499 return DAG.getBitcast(VT, Lo);
10501 int HiIdx = (Offset + 1) * EltBits;
10502 SDValue Hi = DAG.getBitcast(
10503 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10504 DAG.getConstant(EltBits, DL, MVT::i8),
10505 DAG.getConstant(HiIdx, DL, MVT::i8)));
10506 return DAG.getBitcast(VT,
10507 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10510 // If this would require more than 2 unpack instructions to expand, use
10511 // pshufb when available. We can only use more than 2 unpack instructions
10512 // when zero extending i8 elements which also makes it easier to use pshufb.
10513 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10514 assert(NumElements == 16 && "Unexpected byte vector width!");
10515 SDValue PSHUFBMask[16];
10516 for (int i = 0; i < 16; ++i) {
10517 int Idx = Offset + (i / Scale);
// 0x80 in a PSHUFB control byte zeroes the destination byte.
10518 PSHUFBMask[i] = DAG.getConstant(
10519 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10521 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10522 return DAG.getBitcast(
10523 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10524 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10527 // If we are extending from an offset, ensure we start on a boundary that
10528 // we can unpack from.
10529 int AlignToUnpack = Offset % (NumElements / Scale);
10530 if (AlignToUnpack) {
10531 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10532 for (int i = AlignToUnpack; i < NumElements; ++i)
10533 ShMask[i - AlignToUnpack] = i;
10534 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10535 Offset -= AlignToUnpack;
10538 // Otherwise emit a sequence of unpacks.
10540 unsigned UnpackLoHi = X86ISD::UNPCKL;
10541 if (Offset >= (NumElements / 2)) {
10542 UnpackLoHi = X86ISD::UNPCKH;
10543 Offset -= (NumElements / 2);
// Interleave with zeros (or undef for any-extend), halving Scale per
// iteration until elements reach the target width.
10546 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10547 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10548 : getZeroVector(InputVT, Subtarget, DAG, DL);
10549 InputV = DAG.getBitcast(InputVT, InputV);
10550 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
// NOTE(review): the loop-update lines (halving Scale, doubling EltBits,
// etc.) are elided from this listing.
10554 } while (Scale > 1);
10555 return DAG.getBitcast(VT, InputV);
10558 /// Try to lower a vector shuffle as a zero extension on any microarch.
10560 /// This routine will try to do everything in its power to cleverly lower
10561 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10562 /// check for the profitability of this lowering, it tries to aggressively
10563 /// match this pattern. It will use all of the micro-architectural details it
10564 /// can to emit an efficient lowering. It handles both blends with all-zero
10565 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10566 /// masking out later).
10568 /// The reason we have dedicated lowering for zext-style shuffles is that they
10569 /// are both incredibly common and often quite performance sensitive.
10570 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10571 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10572 const APInt &Zeroable, const X86Subtarget &Subtarget,
10573 SelectionDAG &DAG) {
// Basic geometry of the shuffle: total bit width, number of 128-bit lanes,
// and element counts for the whole vector and per 128-bit lane.
10574 int Bits = VT.getSizeInBits();
10575 int NumLanes = Bits / 128;
10576 int NumElements = VT.getVectorNumElements();
10577 int NumEltsPerLane = NumElements / NumLanes;
10578 assert(VT.getScalarSizeInBits() <= 32 &&
10579 "Exceeds 32-bit integer zero extension limit");
10580 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10582 // Define a helper function to check a particular ext-scale and lower to it if
// valid. Scale is the extension factor under test: the number of destination
// elements each surviving source element expands across.
10584 auto Lower = [&](int Scale) -> SDValue {
// Assume an any-extend until we see a position that is required to be zero.
10586 bool AnyExt = true;
10589 for (int i = 0; i < NumElements; ++i) {
10592 continue; // Valid anywhere but doesn't tell us anything.
10593 if (i % Scale != 0) {
10594 // Each of the extended elements need to be zeroable.
10598 // We no longer are in the anyext case.
10603 // Each of the base elements needs to be consecutive indices into the
10604 // same input vector.
10605 SDValue V = M < NumElements ? V1 : V2;
10606 M = M % NumElements;
// The first matching element picks the input vector and base offset;
// every subsequent element must agree with that choice.
10609 Offset = M - (i / Scale);
10610 } else if (InputV != V)
10611 return SDValue(); // Flip-flopping inputs.
10613 // Offset must start in the lowest 128-bit lane or at the start of an
10615 // FIXME: Is it ever worth allowing a negative base offset?
10616 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10617 (Offset % NumEltsPerLane) == 0))
10620 // If we are offsetting, all referenced entries must come from the same
10622 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10625 if ((M % NumElements) != (Offset + (i / Scale)))
10626 return SDValue(); // Non-consecutive strided elements.
10630 // If we fail to find an input, we have a zero-shuffle which should always
10631 // have already been handled.
10632 // FIXME: Maybe handle this here in case during blending we end up with one?
10636 // If we are offsetting, don't extend if we only match a single input, we
10637 // can always do better by using a basic PSHUF or PUNPCK.
10638 if (Offset != 0 && Matches < 2)
10641 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10642 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10645 // The widest scale possible for extending is to a 64-bit integer.
10646 assert(Bits % 64 == 0 &&
10647 "The number of bits in a vector must be divisible by 64 on x86!");
10648 int NumExtElements = Bits / 64;
10650 // Each iteration, try extending the elements half as much, but into twice as
// many result elements: the largest extension factor is tried first.
10652 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10653 assert(NumElements % NumExtElements == 0 &&
10654 "The input vector size must be divisible by the extended size.");
10655 if (SDValue V = Lower(NumElements / NumExtElements))
10659 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10663 // Returns one of the source operands if the shuffle can be reduced to a
10664 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10665 auto CanZExtLowHalf = [&]() {
// The high half must be entirely zeroable and the low half must be a
// sequential copy of the low half of either V1 or V2.
10666 for (int i = NumElements / 2; i != NumElements; ++i)
10669 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10671 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10676 if (SDValue V = CanZExtLowHalf()) {
// VZEXT_MOVL on v2i64 keeps the low 64 bits and zeroes the upper 64 bits.
10677 V = DAG.getBitcast(MVT::v2i64, V);
10678 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10679 return DAG.getBitcast(VT, V);
10682 // No viable ext lowering found.
10686 /// Try to get a scalar value for a specific element of a vector.
10688 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
// Returns the scalar (bitcast to the vector's element type) when one can be
// identified for element \p Idx; otherwise falls through without a result.
10689 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10690 SelectionDAG &DAG) {
10691 MVT VT = V.getSimpleValueType();
10692 MVT EltVT = VT.getVectorElementType();
// Bitcasts do not change the underlying bits, so look through them while
// hunting for the node that actually produced the element.
10693 V = peekThroughBitcasts(V);
10695 // If the bitcasts shift the element size, we can't extract an equivalent
10696 // element from it.
10697 MVT NewVT = V.getSimpleValueType();
10698 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
// SCALAR_TO_VECTOR only defines element 0, hence the Idx == 0 restriction.
10701 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10702 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10703 // Ensure the scalar operand is the same size as the destination.
10704 // FIXME: Add support for scalar truncation where possible.
10705 SDValue S = V.getOperand(Idx);
10706 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10707 return DAG.getBitcast(EltVT, S);
10713 /// Helper to test for a load that can be folded with x86 shuffles.
10715 /// This is particularly important because the set of instructions varies
10716 /// significantly based on whether the operand is a load or not.
10717 static bool isShuffleFoldableLoad(SDValue V) {
// Look through bitcasts first; only a plain (non-extending) load qualifies.
10718 V = peekThroughBitcasts(V);
10719 return ISD::isNON_EXTLoad(V.getNode());
10722 /// Try to lower insertion of a single element into a zero vector.
10724 /// This is a common pattern that we have especially efficient patterns to lower
10725 /// across all subtarget feature sets.
10726 static SDValue lowerVectorShuffleAsElementInsertion(
10727 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10728 const APInt &Zeroable, const X86Subtarget &Subtarget,
10729 SelectionDAG &DAG) {
10731 MVT EltVT = VT.getVectorElementType();
// Locate the mask slot that reads from V2 (mask indices >= Mask.size());
// every other slot must be zeroable for this lowering to apply.
10734 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10736 bool IsV1Zeroable = true;
10737 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10738 if (i != V2Index && !Zeroable[i]) {
10739 IsV1Zeroable = false;
10743 // Check for a single input from a SCALAR_TO_VECTOR node.
10744 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10745 // all the smarts here sunk into that routine. However, the current
10746 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10747 // vector shuffle lowering is dead.
10748 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10750 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10751 // We need to zext the scalar if it is smaller than an i32.
10752 V2S = DAG.getBitcast(EltVT, V2S);
10753 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10754 // Using zext to expand a narrow element won't work for non-zero
10759 // Zero-extend directly to i32.
10760 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10761 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10763 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10764 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10765 EltVT == MVT::i16) {
10766 // Either not inserting from the low element of the input or the input
10767 // element size is too small to use VZEXT_MOVL to clear the high bits.
10771 if (!IsV1Zeroable) {
10772 // If V1 can't be treated as a zero vector we have fewer options to lower
10773 // this. We can't support integer vectors or non-zero targets cheaply, and
10774 // the V1 elements can't be permuted in any way.
10775 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10776 if (!VT.isFloatingPoint() || V2Index != 0)
// Aside from the inserted lane, V1 must already be in place (identity mask).
10778 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10779 V1Mask[V2Index] = -1;
10780 if (!isNoopShuffleMask(V1Mask))
10782 if (!VT.is128BitVector())
10785 // Otherwise, use MOVSD or MOVSS.
10786 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10787 "Only two types of floating point element types to handle!");
10788 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10792 // This lowering only works for the low element with floating point vectors.
10793 if (VT.isFloatingPoint() && V2Index != 0)
// VZEXT_MOVL keeps the low element and zeroes everything above it.
10796 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10798 V2 = DAG.getBitcast(VT, V2);
10800 if (V2Index != 0) {
10801 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10802 // the desired position. Otherwise it is more efficient to do a vector
10803 // shift left. We know that we can do a vector shift left because all
10804 // the inputs are zero.
10805 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10806 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10807 V2Shuffle[V2Index] = 0;
10808 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
// Byte-shift the zero-extended element into place: VSHLDQ (PSLLDQ) takes
// its shift amount in bytes, hence the EltVT-size / 8 scaling below.
10810 V2 = DAG.getBitcast(MVT::v16i8, V2);
10812 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10813 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
10814 V2 = DAG.getBitcast(VT, V2);
10820 /// Try to lower broadcast of a single - truncated - integer element,
10821 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10823 /// This assumes we have AVX2.
10824 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10825 SDValue V0, int BroadcastIdx,
10826 const X86Subtarget &Subtarget,
10827 SelectionDAG &DAG) {
10828 assert(Subtarget.hasAVX2() &&
10829 "We can only lower integer broadcasts with AVX2!");
10831 EVT EltVT = VT.getVectorElementType();
10832 EVT V0VT = V0.getValueType();
10834 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10835 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10837 EVT V0EltVT = V0VT.getVectorElementType();
10838 if (!V0EltVT.isInteger())
10841 const unsigned EltSize = EltVT.getSizeInBits();
10842 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10844 // This is only a truncation if the original element type is larger.
10845 if (V0EltSize <= EltSize)
10848 assert(((V0EltSize % EltSize) == 0) &&
10849 "Scalar type sizes must all be powers of 2 on x86!");
// Scale maps narrow-element indices onto the wider V0 elements; the
// broadcast element lives inside wide element V0BroadcastIdx.
10851 const unsigned V0Opc = V0.getOpcode();
10852 const unsigned Scale = V0EltSize / EltSize;
10853 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
// SCALAR_TO_VECTOR only defines wide element 0; BUILD_VECTOR defines all.
10855 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10856 V0Opc != ISD::BUILD_VECTOR)
10859 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10861 // If we're extracting non-least-significant bits, shift so we can truncate.
10862 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10863 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10864 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10865 if (const int OffsetIdx = BroadcastIdx % Scale)
10866 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10867 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
// Truncate the (possibly shifted) scalar and broadcast it to every lane.
10869 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10870 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10873 /// Try to lower broadcast of a single element.
10875 /// For convenience, this code also bundles all of the subtarget feature set
10876 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10877 /// a convenient way to factor it out.
10878 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10879 SDValue V1, SDValue V2,
10880 ArrayRef<int> Mask,
10881 const X86Subtarget &Subtarget,
10882 SelectionDAG &DAG) {
// Subtarget gating: MOVDDUP (v2f64) needs SSE3, floating point broadcasts
// need AVX, and integer broadcasts need AVX2.
10883 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10884 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10885 (Subtarget.hasAVX2() && VT.isInteger())))
10888 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10889 // we can only broadcast from a register with AVX2.
10890 unsigned NumElts = Mask.size();
10891 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10893 : X86ISD::VBROADCAST;
10894 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10896 // Check that the mask is a broadcast.
// Try each splat index in turn and see if the mask is equivalent to
// splatting that single element.
10897 int BroadcastIdx = -1;
10898 for (int i = 0; i != (int)NumElts; ++i) {
10899 SmallVector<int, 8> BroadcastMask(NumElts, i);
10900 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10906 if (BroadcastIdx < 0)
10908 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10909 "a sorted mask where the broadcast "
10912 // Go up the chain of (vector) values to find a scalar load that we can
10913 // combine with the broadcast.
10916 switch (V.getOpcode()) {
10917 case ISD::BITCAST: {
10918 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10919 SDValue VSrc = V.getOperand(0);
10920 unsigned NumEltBits = V.getScalarValueSizeInBits();
10921 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
// Rescale the broadcast index to the source's element granularity; bail
// on bitcasts where the index would not land on an element boundary.
10922 if ((NumEltBits % NumSrcBits) == 0)
10923 BroadcastIdx *= (NumEltBits / NumSrcBits);
10924 else if ((NumSrcBits % NumEltBits) == 0 &&
10925 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10926 BroadcastIdx /= (NumSrcBits / NumEltBits);
10932 case ISD::CONCAT_VECTORS: {
// Narrow to the concat operand that actually contains the element.
10933 int OperandSize = Mask.size() / V.getNumOperands();
10934 V = V.getOperand(BroadcastIdx / OperandSize);
10935 BroadcastIdx %= OperandSize;
10938 case ISD::INSERT_SUBVECTOR: {
10939 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10940 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10944 int BeginIdx = (int)ConstantIdx->getZExtValue();
10946 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
// Follow the inserted subvector when it covers the broadcast element;
// otherwise keep walking the outer vector.
10947 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10948 BroadcastIdx -= BeginIdx;
10959 // Ensure the source vector and BroadcastIdx are for a suitable type.
10960 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10961 unsigned NumEltBits = VT.getScalarSizeInBits();
10962 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10963 if ((NumSrcBits % NumEltBits) == 0)
10964 BroadcastIdx *= (NumSrcBits / NumEltBits);
10965 else if ((NumEltBits % NumSrcBits) == 0 &&
10966 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10967 BroadcastIdx /= (NumEltBits / NumSrcBits);
10971 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10972 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10973 V = DAG.getBitcast(SrcVT, V);
10976 // Check if this is a broadcast of a scalar. We special case lowering
10977 // for scalars so that we can more effectively fold with loads.
10978 // First, look through bitcast: if the original value has a larger element
10979 // type than the shuffle, the broadcast element is in essence truncated.
10980 // Make that explicit to ease folding.
10981 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10982 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10983 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10984 return TruncBroadcast;
10986 MVT BroadcastVT = VT;
10988 // Peek through any bitcast (only useful for loads).
10989 SDValue BC = peekThroughBitcasts(V);
10991 // Also check the simpler case, where we can directly reuse the scalar.
10992 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10993 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10994 V = V.getOperand(BroadcastIdx);
10996 // If we can't broadcast from a register, check that the input is a load.
10997 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10999 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
11000 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11001 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
11002 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
11003 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
11008 // If we are broadcasting a load that is only used by the shuffle
11009 // then we can reduce the vector load to the broadcasted scalar load.
11010 LoadSDNode *Ld = cast<LoadSDNode>(BC);
11011 SDValue BaseAddr = Ld->getOperand(1);
11012 EVT SVT = BroadcastVT.getScalarType();
// Address of the single element we actually need from the wide load.
11013 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
11014 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
11015 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
11016 DAG.getMachineFunction().getMachineMemOperand(
11017 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
// Keep memory ordering equivalent between the original wide load and the
// new narrow scalar load.
11018 DAG.makeEquivalentMemoryOrdering(Ld, V);
11019 } else if (!BroadcastFromReg) {
11020 // We can't broadcast from a vector register.
11022 } else if (BroadcastIdx != 0) {
11023 // We can only broadcast from the zero-element of a vector register,
11024 // but it can be advantageous to broadcast from the zero-element of a
11026 if (!VT.is256BitVector() && !VT.is512BitVector())
11029 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
11030 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11033 // Only broadcast the zero-element of a 128-bit subvector.
11034 unsigned EltSize = VT.getScalarSizeInBits();
11035 if (((BroadcastIdx * EltSize) % 128) != 0)
11038 // The shuffle input might have been a bitcast we looked through; look at
11039 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
11040 // later bitcast it to BroadcastVT.
11041 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11042 "Unexpected vector element size");
11043 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
11044 "Unexpected vector size");
11045 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
// MOVDDUP requires a vector operand; wrap a bare f64 scalar if needed.
11048 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
11049 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
11050 DAG.getBitcast(MVT::f64, V));
11052 // Bitcast back to the same scalar type as BroadcastVT.
11053 MVT SrcVT = V.getSimpleValueType();
11054 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
11055 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11056 "Unexpected vector element size");
11057 if (SrcVT.isVector()) {
11058 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11059 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
11061 SrcVT = BroadcastVT.getScalarType();
11063 V = DAG.getBitcast(SrcVT, V);
11066 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11067 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
11068 V = DAG.getBitcast(MVT::f64, V);
11069 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
11070 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
11073 // We only support broadcasting from 128-bit vectors to minimize the
11074 // number of patterns we need to deal with in isel. So extract down to
11075 // 128-bits, removing as many bitcasts as possible.
11076 if (SrcVT.getSizeInBits() > 128) {
11077 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
11078 128 / SrcVT.getScalarSizeInBits());
11079 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
11080 V = DAG.getBitcast(ExtVT, V);
// Emit the broadcast in BroadcastVT and cast the result to the asked-for VT.
11083 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
11086 // Check for whether we can use INSERTPS to perform the shuffle. We only use
11087 // INSERTPS when the V1 elements are already in the correct locations
11088 // because otherwise we can just always use two SHUFPS instructions which
11089 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
11090 // perform INSERTPS if a single V1 element is out of place and all V2
11091 // elements are zeroable.
// On success, V1/V2 and InsertPSMask are updated (out-parameters) to describe
// the INSERTPS operands and its immediate; returns whether a match was found.
11092 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
11093 unsigned &InsertPSMask,
11094 const APInt &Zeroable,
11095 ArrayRef<int> Mask,
11096 SelectionDAG &DAG) {
11097 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
11098 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
11099 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11101 // Attempt to match INSERTPS with one element from VA or VB being
11102 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
11104 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
11105 ArrayRef<int> CandidateMask) {
// ZMask collects lanes that INSERTPS should force to zero; the Dst indices
// record the single lane (if any) that must be inserted from VA or VB.
11106 unsigned ZMask = 0;
11107 int VADstIndex = -1;
11108 int VBDstIndex = -1;
11109 bool VAUsedInPlace = false;
11111 for (int i = 0; i < 4; ++i) {
11112 // Synthesize a zero mask from the zeroable elements (includes undefs).
11118 // Flag if we use any VA inputs in place.
11119 if (i == CandidateMask[i]) {
11120 VAUsedInPlace = true;
11124 // We can only insert a single non-zeroable element.
11125 if (VADstIndex >= 0 || VBDstIndex >= 0)
11128 if (CandidateMask[i] < 4) {
11129 // VA input out of place for insertion.
11132 // VB input for insertion.
11137 // Don't bother if we have no (non-zeroable) element for insertion.
11138 if (VADstIndex < 0 && VBDstIndex < 0)
11141 // Determine element insertion src/dst indices. The src index is from the
11142 // start of the inserted vector, not the start of the concatenated vector.
11143 unsigned VBSrcIndex = 0;
11144 if (VADstIndex >= 0) {
11145 // If we have a VA input out of place, we use VA as the V2 element
11146 // insertion and don't use the original V2 at all.
11147 VBSrcIndex = CandidateMask[VADstIndex];
11148 VBDstIndex = VADstIndex;
11151 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
11154 // If no V1 inputs are used in place, then the result is created only from
11155 // the zero mask and the V2 insertion - so remove V1 dependency.
11156 if (!VAUsedInPlace)
11157 VA = DAG.getUNDEF(MVT::v4f32);
11159 // Update V1, V2 and InsertPSMask accordingly.
11163 // Insert the V2 element into the desired position.
// INSERTPS immediate layout: bits [7:6] = source element, bits [5:4] =
// destination element, bits [3:0] = zero mask.
11164 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
11165 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
11169 if (matchAsInsertPS(V1, V2, Mask))
11172 // Commute and try again.
11173 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11174 ShuffleVectorSDNode::commuteMask(CommutedMask);
11175 if (matchAsInsertPS(V2, V1, CommutedMask))
// Try to lower a v4f32 shuffle to a single INSERTPS instruction by matching
// the mask with matchVectorShuffleAsInsertPS and emitting the node on success.
11181 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
11182 SDValue V2, ArrayRef<int> Mask,
11183 const APInt &Zeroable,
11184 SelectionDAG &DAG) {
11185 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11186 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11188 // Attempt to match the insertps pattern.
11189 unsigned InsertPSMask;
11190 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11193 // Insert the V2 element into the desired position.
11194 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11195 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11198 /// Try to lower a shuffle as a permute of the inputs followed by an
11199 /// UNPCK instruction.
11201 /// This specifically targets cases where we end up with alternating between
11202 /// the two inputs, and so can permute them into something that feeds a single
11203 /// UNPCK instruction. Note that this routine only targets integer vectors
11204 /// because for floating point vectors we have a generalized SHUFPS lowering
11205 /// strategy that handles everything that doesn't *exactly* match an unpack,
11206 /// making this clever lowering unnecessary.
11207 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11208 SDValue V1, SDValue V2,
11209 ArrayRef<int> Mask,
11210 SelectionDAG &DAG) {
11211 assert(!VT.isFloatingPoint() &&
11212 "This routine only supports integer vectors.");
11213 assert(VT.is128BitVector() &&
11214 "This routine only works on 128-bit vectors.");
11215 assert(!V2.isUndef() &&
11216 "This routine should only be used when blending two inputs.");
11217 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11219 int Size = Mask.size();
// Count how many mask entries reference the low vs. high half of their
// source; this decides between UNPCKL and UNPCKH.
11222 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11224 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11226 bool UnpackLo = NumLoInputs >= NumHiInputs;
// Attempt an unpack at a given element width; Scale is the number of
// original elements per unpacked element of that width.
11228 auto TryUnpack = [&](int ScalarSize, int Scale) {
11229 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11230 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11232 for (int i = 0; i < Size; ++i) {
11236 // Each element of the unpack contains Scale elements from this mask.
11237 int UnpackIdx = i / Scale;
11239 // We only handle the case where V1 feeds the first slots of the unpack.
11240 // We rely on canonicalization to ensure this is the case.
11241 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11244 // Setup the mask for this input. The indexing is tricky as we have to
11245 // handle the unpack stride.
11246 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11247 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11251 // If we will have to shuffle both inputs to use the unpack, check whether
11252 // we can just unpack first and shuffle the result. If so, skip this unpack.
11253 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11254 !isNoopShuffleMask(V2Mask))
11257 // Shuffle the inputs into place.
11258 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11259 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11261 // Cast the inputs to the type we will use to unpack them.
11262 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11263 V1 = DAG.getBitcast(UnpackVT, V1);
11264 V2 = DAG.getBitcast(UnpackVT, V2);
11266 // Unpack the inputs and cast the result back to the desired type.
11267 return DAG.getBitcast(
11268 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11269 UnpackVT, V1, V2));
11272 // We try each unpack from the largest to the smallest to try and find one
11273 // that fits this mask.
11274 int OrigScalarSize = VT.getScalarSizeInBits();
11275 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11276 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11279 // If none of the unpack-rooted lowerings worked (or were profitable) try an
// unpack-then-permute approach when all inputs come from a single half.
11281 if (NumLoInputs == 0 || NumHiInputs == 0) {
11282 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11283 "We have to have *some* inputs!");
11284 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11286 // FIXME: We could consider the total complexity of the permute of each
11287 // possible unpacking. Or at the least we should consider how many
11288 // half-crossings are created.
11289 // FIXME: We could consider commuting the unpacks.
11291 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11292 for (int i = 0; i < Size; ++i) {
11296 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
// After an unpack, V1 and V2 elements interleave: even result slots come
// from V1, odd slots from V2 — hence the 2*index +0/+1 mapping.
11299 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11301 return DAG.getVectorShuffle(
11302 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
11304 DAG.getUNDEF(VT), PermMask);
11310 /// Handle lowering of 2-lane 64-bit floating point shuffles.
11312 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
11313 /// support for floating point shuffles but not integer shuffles. These
11314 /// instructions will incur a domain crossing penalty on some chips though so
11315 /// it is better to avoid lowering through this for integer vectors where
11317 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11318 const APInt &Zeroable,
11319 SDValue V1, SDValue V2,
11320 const X86Subtarget &Subtarget,
11321 SelectionDAG &DAG) {
11322 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11323 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11324 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11326 if (V2.isUndef()) {
11327 // Check for being able to broadcast a single element.
11328 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11329 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
11332 // Straight shuffle of a single input vector. Simulate this by using the
11333 // single input as both of the "inputs" to this instruction..
// SHUFPD-style immediate: bit i selects high (1) or low (0) element for
// result lane i.
11334 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
11336 if (Subtarget.hasAVX()) {
11337 // If we have AVX, we can use VPERMILPS which will allow folding a load
11338 // into the shuffle.
11339 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
11340 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11343 return DAG.getNode(
11344 X86ISD::SHUFP, DL, MVT::v2f64,
11345 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11346 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11347 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11349 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11350 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11351 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11352 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11354 // When loading a scalar and then shuffling it into a vector we can often do
11355 // the insertion cheaply.
11356 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11357 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11359 // Try inverting the insertion since for v2 masks it is easy to do and we
11360 // can't reliably sort the mask one way or the other.
// XOR with 2 swaps which input a mask element refers to (V1 <-> V2).
11361 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
11362 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
11363 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11364 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11367 // Try to use one of the special instruction patterns to handle two common
11368 // blend patterns if a zero-blend above didn't work.
11369 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
11370 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
11371 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
11372 // We can either use a special instruction to load over the low double or
11373 // to move just the low double.
11374 return DAG.getNode(
11375 X86ISD::MOVSD, DL, MVT::v2f64, V2,
11376 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
11378 if (Subtarget.hasSSE41())
11379 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
11380 Zeroable, Subtarget, DAG))
11383 // Use dedicated unpack instructions for masks that match their pattern.
11385 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
// Fallback: a two-input SHUFPD. Mask[1] is rebased from V2's index space
// (2..3) before forming the immediate.
11388 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
11389 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
11390 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11393 /// Handle lowering of 2-lane 64-bit integer shuffles.
11395 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
11396 /// the integer unit to minimize domain crossing penalties. However, for blends
11397 /// it falls back to the floating point shuffle operation with appropriate bit
11399 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11400 const APInt &Zeroable,
11401 SDValue V1, SDValue V2,
11402 const X86Subtarget &Subtarget,
11403 SelectionDAG &DAG) {
11404 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11405 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11406 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11408 if (V2.isUndef()) {
11409 // Check for being able to broadcast a single element.
11410 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11411 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11414 // Straight shuffle of a single input vector. For everything from SSE2
11415 // onward this has a single fast instruction with no scary immediates.
11416 // We have to map the mask as it is actually a v4i32 shuffle instruction.
11417 V1 = DAG.getBitcast(MVT::v4i32, V1);
// Each v2i64 lane expands to a pair of consecutive v4i32 lanes; undef (-1)
// entries are clamped to 0 for the PSHUFD immediate.
11418 int WidenedMask[4] = {
11419 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
11420 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
11421 return DAG.getBitcast(
11423 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11424 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
11426 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
11427 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
11428 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11429 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11431 // Try to use shift instructions.
11432 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
11433 Zeroable, Subtarget, DAG))
11436 // When loading a scalar and then shuffling it into a vector we can often do
11437 // the insertion cheaply.
11438 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11439 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11441 // Try inverting the insertion since for v2 masks it is easy to do and we
11442 // can't reliably sort the mask one way or the other.
// XOR with 2 swaps which input each mask element refers to (V1 <-> V2).
11443 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
11444 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11445 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11448 // We have different paths for blend lowering, but they all must use the
11449 // *exact* same predicate.
11450 bool IsBlendSupported = Subtarget.hasSSE41();
11451 if (IsBlendSupported)
11452 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11453 Zeroable, Subtarget, DAG))
11456 // Use dedicated unpack instructions for masks that match their pattern.
11458 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11461 // Try to use byte rotation instructions.
11462 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
11463 if (Subtarget.hasSSSE3()) {
11464 if (Subtarget.hasVLX())
11465 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11466 Mask, Subtarget, DAG))
11469 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11470 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11474 // If we have direct support for blends, we should lower by decomposing into
11475 // a permute. That will be faster than the domain cross.
11476 if (IsBlendSupported)
11477 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11480 // We implement this with SHUFPD which is pretty lame because it will likely
11481 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11482 // However, all the alternatives are still more cycles and newer chips don't
11483 // have this problem. It would be really nice if x86 had better shuffles here.
11484 V1 = DAG.getBitcast(MVT::v2f64, V1);
11485 V2 = DAG.getBitcast(MVT::v2f64, V2);
11486 return DAG.getBitcast(MVT::v2i64,
11487 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11490 /// Test whether this can be lowered with a single SHUFPS instruction.
11492 /// This is used to disable more specialized lowerings when the shufps lowering
11493 /// will happen to be efficient.
11494 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11495 // This routine only handles 128-bit shufps.
11496 assert(Mask.size() == 4 && "Unsupported mask size!");
11497 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11498 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11499 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11500 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11502 // To lower with a single SHUFPS we need to have the low half and high half
11503 // each requiring a single input.
11504 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11506 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11512 /// Lower a vector shuffle using the SHUFPS instruction.
11514 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11515 /// It makes no assumptions about whether this is the *best* lowering, it simply
11517 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11518 ArrayRef<int> Mask, SDValue V1,
11519 SDValue V2, SelectionDAG &DAG) {
11520 SDValue LowV = V1, HighV = V2;
11521 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11523 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11525 if (NumV2Elements == 1) {
11526 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11528 // Compute the index adjacent to V2Index and in the same half by toggling
11530 int V2AdjIndex = V2Index ^ 1;
11532 if (Mask[V2AdjIndex] < 0) {
11533 // Handles all the cases where we have a single V2 element and an undef.
11534 // This will only ever happen in the high lanes because we commute the
11535 // vector otherwise.
11537 std::swap(LowV, HighV);
11538 NewMask[V2Index] -= 4;
11540 // Handle the case where the V2 element ends up adjacent to a V1 element.
11541 // To make this work, blend them together as the first step.
11542 int V1Index = V2AdjIndex;
11543 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11544 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11545 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11547 // Now proceed to reconstruct the final blend as we have the necessary
11548 // high or low half formed.
11555 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11556 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11558 } else if (NumV2Elements == 2) {
11559 if (Mask[0] < 4 && Mask[1] < 4) {
11560 // Handle the easy case where we have V1 in the low lanes and V2 in the
11564 } else if (Mask[2] < 4 && Mask[3] < 4) {
11565 // We also handle the reversed case because this utility may get called
11566 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11567 // arrange things in the right direction.
11573 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11574 // trying to place elements directly, just blend them and set up the final
11575 // shuffle to place them.
11577 // The first two blend mask elements are for V1, the second two are for
11579 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11580 Mask[2] < 4 ? Mask[2] : Mask[3],
11581 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11582 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11583 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11584 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11586 // Now we do a normal shuffle of V1 by giving V1 as both operands to
11589 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11590 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11591 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11592 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11595 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11596 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11599 /// Lower 4-lane 32-bit floating point shuffles.
11601 /// Uses instructions exclusively from the floating point unit to minimize
11602 /// domain crossing penalties, as these are sufficient to implement all v4f32
11604 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11605 const APInt &Zeroable,
11606 SDValue V1, SDValue V2,
11607 const X86Subtarget &Subtarget,
11608 SelectionDAG &DAG) {
11609 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11610 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11611 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11613 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11615 if (NumV2Elements == 0) {
11616 // Check for being able to broadcast a single element.
11617 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11618 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11621 // Use even/odd duplicate instructions for masks that match their pattern.
11622 if (Subtarget.hasSSE3()) {
11623 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11624 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11625 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11626 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11629 if (Subtarget.hasAVX()) {
11630 // If we have AVX, we can use VPERMILPS which will allow folding a load
11631 // into the shuffle.
11632 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11633 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11636 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11637 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11638 if (!Subtarget.hasSSE2()) {
11639 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11640 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11641 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11642 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11645 // Otherwise, use a straight shuffle of a single input vector. We pass the
11646 // input vector to both operands to simulate this with a SHUFPS.
11647 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11648 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11651 // There are special ways we can lower some single-element blends. However, we
11652 // have custom ways we can lower more complex single-element blends below that
11653 // we defer to if both this and BLENDPS fail to match, so restrict this to
11654 // when the V2 input is targeting element 0 of the mask -- that is the fast
11656 if (NumV2Elements == 1 && Mask[0] >= 4)
11657 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11658 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11661 if (Subtarget.hasSSE41()) {
11662 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11663 Zeroable, Subtarget, DAG))
11666 // Use INSERTPS if we can complete the shuffle efficiently.
11668 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11671 if (!isSingleSHUFPSMask(Mask))
11672 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11673 DL, MVT::v4f32, V1, V2, Mask, DAG))
11677 // Use low/high mov instructions. These are only valid in SSE1 because
11678 // otherwise they are widened to v2f64 and never get here.
11679 if (!Subtarget.hasSSE2()) {
11680 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11681 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11682 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11683 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11686 // Use dedicated unpack instructions for masks that match their pattern.
11688 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11691 // Otherwise fall back to a SHUFPS lowering strategy.
11692 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11695 /// Lower 4-lane i32 vector shuffles.
11697 /// We try to handle these with integer-domain shuffles where we can, but for
11698 /// blends we use the floating point domain blend instructions.
11699 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11700 const APInt &Zeroable,
11701 SDValue V1, SDValue V2,
11702 const X86Subtarget &Subtarget,
11703 SelectionDAG &DAG) {
11704 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11705 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11706 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11708 // Whenever we can lower this as a zext, that instruction is strictly faster
11709 // than any alternative. It also allows us to fold memory operands into the
11710 // shuffle in many cases.
11711 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11712 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11715 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11717 if (NumV2Elements == 0) {
11718 // Check for being able to broadcast a single element.
11719 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11720 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11723 // Straight shuffle of a single input vector. For everything from SSE2
11724 // onward this has a single fast instruction with no scary immediates.
11725 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11726 // but we aren't actually going to use the UNPCK instruction because doing
11727 // so prevents folding a load into this instruction or making a copy.
11728 const int UnpackLoMask[] = {0, 0, 1, 1};
11729 const int UnpackHiMask[] = {2, 2, 3, 3};
11730 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11731 Mask = UnpackLoMask;
11732 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11733 Mask = UnpackHiMask;
11735 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11736 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11739 // Try to use shift instructions.
11740 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11741 Zeroable, Subtarget, DAG))
11744 // There are special ways we can lower some single-element blends.
11745 if (NumV2Elements == 1)
11746 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11747 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11750 // We have different paths for blend lowering, but they all must use the
11751 // *exact* same predicate.
11752 bool IsBlendSupported = Subtarget.hasSSE41();
11753 if (IsBlendSupported)
11754 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11755 Zeroable, Subtarget, DAG))
11758 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11762 // Use dedicated unpack instructions for masks that match their pattern.
11764 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11767 // Try to use byte rotation instructions.
11768 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
11769 if (Subtarget.hasSSSE3()) {
11770 if (Subtarget.hasVLX())
11771 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11772 Mask, Subtarget, DAG))
11775 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11776 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11780 // Assume that a single SHUFPS is faster than an alternative sequence of
11781 // multiple instructions (even if the CPU has a domain penalty).
11782 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11783 if (!isSingleSHUFPSMask(Mask)) {
11784 // If we have direct support for blends, we should lower by decomposing into
11785 // a permute. That will be faster than the domain cross.
11786 if (IsBlendSupported)
11787 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11790 // Try to lower by permuting the inputs into an unpack instruction.
11791 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11792 DL, MVT::v4i32, V1, V2, Mask, DAG))
11796 // We implement this with SHUFPS because it can blend from two vectors.
11797 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11798 // up the inputs, bypassing domain shift penalties that we would incur if we
11799 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11801 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11802 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11803 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11804 return DAG.getBitcast(MVT::v4i32, ShufPS);
11807 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11808 /// shuffle lowering, and the most complex part.
11810 /// The lowering strategy is to try to form pairs of input lanes which are
11811 /// targeted at the same half of the final vector, and then use a dword shuffle
11812 /// to place them onto the right half, and finally unpack the paired lanes into
11813 /// their final position.
11815 /// The exact breakdown of how to form these dword pairs and align them on the
11816 /// correct sides is really tricky. See the comments within the function for
11817 /// more of the details.
11819 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11820 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11821 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11822 /// vector, form the analogous 128-bit 8-element Mask.
11823 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11824 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11825 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11826 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11827 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11829 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11830 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11831 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11833 // Attempt to directly match PSHUFLW or PSHUFHW.
11834 if (isUndefOrInRange(LoMask, 0, 4) &&
11835 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11836 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11837 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11839 if (isUndefOrInRange(HiMask, 4, 8) &&
11840 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11841 for (int i = 0; i != 4; ++i)
11842 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11843 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11844 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11847 SmallVector<int, 4> LoInputs;
11848 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11849 array_pod_sort(LoInputs.begin(), LoInputs.end());
11850 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11851 SmallVector<int, 4> HiInputs;
11852 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11853 array_pod_sort(HiInputs.begin(), HiInputs.end());
11854 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11856 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11857 int NumHToL = LoInputs.size() - NumLToL;
11859 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11860 int NumHToH = HiInputs.size() - NumLToH;
11861 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11862 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11863 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11864 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11866 // If we are shuffling values from one half - check how many different DWORD
11867 // pairs we need to create. If only 1 or 2 then we can perform this as a
11868 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11869 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11870 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11871 V = DAG.getNode(ShufWOp, DL, VT, V,
11872 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11873 V = DAG.getBitcast(PSHUFDVT, V);
11874 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11875 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11876 return DAG.getBitcast(VT, V);
11879 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11880 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11881 SmallVector<std::pair<int, int>, 4> DWordPairs;
11882 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11884 // Collect the different DWORD pairs.
11885 for (int DWord = 0; DWord != 4; ++DWord) {
11886 int M0 = Mask[2 * DWord + 0];
11887 int M1 = Mask[2 * DWord + 1];
11888 M0 = (M0 >= 0 ? M0 % 4 : M0);
11889 M1 = (M1 >= 0 ? M1 % 4 : M1);
11890 if (M0 < 0 && M1 < 0)
11893 bool Match = false;
11894 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11895 auto &DWordPair = DWordPairs[j];
11896 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11897 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11898 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11899 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11900 PSHUFDMask[DWord] = DOffset + j;
11906 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11907 DWordPairs.push_back(std::make_pair(M0, M1));
11911 if (DWordPairs.size() <= 2) {
11912 DWordPairs.resize(2, std::make_pair(-1, -1));
11913 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11914 DWordPairs[1].first, DWordPairs[1].second};
11915 if ((NumHToL + NumHToH) == 0)
11916 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11917 if ((NumLToL + NumLToH) == 0)
11918 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11922 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11923 // such inputs we can swap two of the dwords across the half mark and end up
11924 // with <=2 inputs to each half in each half. Once there, we can fall through
11925 // to the generic code below. For example:
11927 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11928 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11930 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11931 // and an existing 2-into-2 on the other half. In this case we may have to
11932 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11933 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11934 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11935 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11936 // half than the one we target for fixing) will be fixed when we re-enter this
11937 // path. We will also combine away any sequence of PSHUFD instructions that
11938 // result into a single instruction. Here is an example of the tricky case:
11940 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11941 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11943 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11945 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11946 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11948 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11949 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11951 // The result is fine to be handled by the generic logic.
11952 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11953 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11954 int AOffset, int BOffset) {
11955 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11956 "Must call this with A having 3 or 1 inputs from the A half.");
11957 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11958 "Must call this with B having 1 or 3 inputs from the B half.");
11959 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11960 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11962 bool ThreeAInputs = AToAInputs.size() == 3;
11964 // Compute the index of dword with only one word among the three inputs in
11965 // a half by taking the sum of the half with three inputs and subtracting
11966 // the sum of the actual three inputs. The difference is the remaining
11968 int ADWord, BDWord;
11969 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11970 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11971 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11972 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11973 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11974 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11975 int TripleNonInputIdx =
11976 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11977 TripleDWord = TripleNonInputIdx / 2;
11979 // We use xor with one to compute the adjacent DWord to whichever one the
11981 OneInputDWord = (OneInput / 2) ^ 1;
11983 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11984 // and BToA inputs. If there is also such a problem with the BToB and AToB
11985 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11986 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11987 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11988 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11989 // Compute how many inputs will be flipped by swapping these DWords. We
11991 // to balance this to ensure we don't form a 3-1 shuffle in the other
11993 int NumFlippedAToBInputs =
11994 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11995 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11996 int NumFlippedBToBInputs =
11997 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11998 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11999 if ((NumFlippedAToBInputs == 1 &&
12000 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
12001 (NumFlippedBToBInputs == 1 &&
12002 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
12003 // We choose whether to fix the A half or B half based on whether that
12004 // half has zero flipped inputs. At zero, we may not be able to fix it
12005 // with that half. We also bias towards fixing the B half because that
12006 // will more commonly be the high half, and we have to bias one way.
12007 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
12008 ArrayRef<int> Inputs) {
12009 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
12010 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
12011 // Determine whether the free index is in the flipped dword or the
12012 // unflipped dword based on where the pinned index is. We use this bit
12013 // in an xor to conditionally select the adjacent dword.
12014 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
12015 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12016 if (IsFixIdxInput == IsFixFreeIdxInput)
12018 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12019 assert(IsFixIdxInput != IsFixFreeIdxInput &&
12020 "We need to be changing the number of flipped inputs!");
12021 int PSHUFHalfMask[] = {0, 1, 2, 3};
12022 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
12024 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
12025 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
12026 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
12028 for (int &M : Mask)
12029 if (M >= 0 && M == FixIdx)
12031 else if (M >= 0 && M == FixFreeIdx)
12034 if (NumFlippedBToBInputs != 0) {
12036 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
12037 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
12039 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
12040 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
12041 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
12046 int PSHUFDMask[] = {0, 1, 2, 3};
12047 PSHUFDMask[ADWord] = BDWord;
12048 PSHUFDMask[BDWord] = ADWord;
12049 V = DAG.getBitcast(
12051 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12052 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12054 // Adjust the mask to match the new locations of A and B.
12055 for (int &M : Mask)
12056 if (M >= 0 && M/2 == ADWord)
12057 M = 2 * BDWord + M % 2;
12058 else if (M >= 0 && M/2 == BDWord)
12059 M = 2 * ADWord + M % 2;
12061 // Recurse back into this routine to re-compute state now that this isn't
12062 // a 3 and 1 problem.
12063 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
12066 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
12067 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
12068 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
12069 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
12071 // At this point there are at most two inputs to the low and high halves from
12072 // each half. That means the inputs can always be grouped into dwords and
12073 // those dwords can then be moved to the correct half with a dword shuffle.
12074 // We use at most one low and one high word shuffle to collect these paired
12075 // inputs into dwords, and finally a dword shuffle to place them.
12076 int PSHUFLMask[4] = {-1, -1, -1, -1};
12077 int PSHUFHMask[4] = {-1, -1, -1, -1};
12078 int PSHUFDMask[4] = {-1, -1, -1, -1};
12080 // First fix the masks for all the inputs that are staying in their
12081 // original halves. This will then dictate the targets of the cross-half
12083 auto fixInPlaceInputs =
12084 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
12085 MutableArrayRef<int> SourceHalfMask,
12086 MutableArrayRef<int> HalfMask, int HalfOffset) {
12087 if (InPlaceInputs.empty())
12089 if (InPlaceInputs.size() == 1) {
12090 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12091 InPlaceInputs[0] - HalfOffset;
12092 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
12095 if (IncomingInputs.empty()) {
12096 // Just fix all of the in place inputs.
12097 for (int Input : InPlaceInputs) {
12098 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
12099 PSHUFDMask[Input / 2] = Input / 2;
12104 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
12105 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12106 InPlaceInputs[0] - HalfOffset;
12107 // Put the second input next to the first so that they are packed into
12108 // a dword. We find the adjacent index by toggling the low bit.
12109 int AdjIndex = InPlaceInputs[0] ^ 1;
12110 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
12111 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
12112 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
12114 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
12115 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
12117 // Now gather the cross-half inputs and place them into a free dword of
12118 // their target half.
12119 // FIXME: This operation could almost certainly be simplified dramatically to
12120 // look more like the 3-1 fixing operation.
12121 auto moveInputsToRightHalf = [&PSHUFDMask](
12122 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
12123 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
12124 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
12126 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
12127 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
12129 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
12131 int LowWord = Word & ~1;
12132 int HighWord = Word | 1;
12133 return isWordClobbered(SourceHalfMask, LowWord) ||
12134 isWordClobbered(SourceHalfMask, HighWord);
12137 if (IncomingInputs.empty())
12140 if (ExistingInputs.empty()) {
12141 // Map any dwords with inputs from them into the right half.
12142 for (int Input : IncomingInputs) {
12143 // If the source half mask maps over the inputs, turn those into
12144 // swaps and use the swapped lane.
12145 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
12146 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
12147 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
12148 Input - SourceOffset;
12149 // We have to swap the uses in our half mask in one sweep.
12150 for (int &M : HalfMask)
12151 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
12153 else if (M == Input)
12154 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12156 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
12157 Input - SourceOffset &&
12158 "Previous placement doesn't match!");
12160 // Note that this correctly re-maps both when we do a swap and when
12161 // we observe the other side of the swap above. We rely on that to
12162 // avoid swapping the members of the input list directly.
12163 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12166 // Map the input's dword into the correct half.
12167 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
12168 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
12170 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
12172 "Previous placement doesn't match!");
12175 // And just directly shift any other-half mask elements to be same-half
12176 // as we will have mirrored the dword containing the element into the
12177 // same position within that half.
12178 for (int &M : HalfMask)
12179 if (M >= SourceOffset && M < SourceOffset + 4) {
12180 M = M - SourceOffset + DestOffset;
12181 assert(M >= 0 && "This should never wrap below zero!");
12186 // Ensure we have the input in a viable dword of its current half. This
12187 // is particularly tricky because the original position may be clobbered
12188 // by inputs being moved and *staying* in that half.
12189 if (IncomingInputs.size() == 1) {
12190 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12191 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
12193 SourceHalfMask[InputFixed - SourceOffset] =
12194 IncomingInputs[0] - SourceOffset;
12195 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12197 IncomingInputs[0] = InputFixed;
12199 } else if (IncomingInputs.size() == 2) {
12200 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12201 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12202 // We have two non-adjacent or clobbered inputs we need to extract from
12203 // the source half. To do this, we need to map them into some adjacent
12204 // dword slot in the source mask.
12205 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12206 IncomingInputs[1] - SourceOffset};
12208 // If there is a free slot in the source half mask adjacent to one of
12209 // the inputs, place the other input in it. We use (Index XOR 1) to
12210 // compute an adjacent index.
12211 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12212 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12213 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12214 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12215 InputsFixed[1] = InputsFixed[0] ^ 1;
12216 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12217 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12218 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12219 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12220 InputsFixed[0] = InputsFixed[1] ^ 1;
12221 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12222 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12223 // The two inputs are in the same DWord but it is clobbered and the
12224 // adjacent DWord isn't used at all. Move both inputs to the free
12226 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12227 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12228 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12229 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12231 // The only way we hit this point is if there is no clobbering
12232 // (because there are no off-half inputs to this half) and there is no
12233 // free slot adjacent to one of the inputs. In this case, we have to
12234 // swap an input with a non-input.
12235 for (int i = 0; i < 4; ++i)
12236 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
12237 "We can't handle any clobbers here!");
12238 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
12239 "Cannot have adjacent inputs here!");
12241 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12242 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
12244 // We also have to update the final source mask in this case because
12245 // it may need to undo the above swap.
12246 for (int &M : FinalSourceHalfMask)
12247 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
12248 M = InputsFixed[1] + SourceOffset;
12249 else if (M == InputsFixed[1] + SourceOffset)
12250 M = (InputsFixed[0] ^ 1) + SourceOffset;
12252 InputsFixed[1] = InputsFixed[0] ^ 1;
12255 // Point everything at the fixed inputs.
12256 for (int &M : HalfMask)
12257 if (M == IncomingInputs[0])
12258 M = InputsFixed[0] + SourceOffset;
12259 else if (M == IncomingInputs[1])
12260 M = InputsFixed[1] + SourceOffset;
12262 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
12263 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
12266 llvm_unreachable("Unhandled input size!");
12269 // Now hoist the DWord down to the right half.
12270 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
12271 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
12272 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
12273 for (int &M : HalfMask)
12274 for (int Input : IncomingInputs)
12276 M = FreeDWord * 2 + Input % 2;
12278 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
12279 /*SourceOffset*/ 4, /*DestOffset*/ 0);
12280 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
12281 /*SourceOffset*/ 0, /*DestOffset*/ 4);
12283 // Now enact all the shuffles we've computed to move the inputs into their
12285 if (!isNoopShuffleMask(PSHUFLMask))
12286 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12287 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
12288 if (!isNoopShuffleMask(PSHUFHMask))
12289 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12290 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
12291 if (!isNoopShuffleMask(PSHUFDMask))
12292 V = DAG.getBitcast(
12294 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12295 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12297 // At this point, each half should contain all its inputs, and we can then
12298 // just shuffle them into their final position.
12299 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
12300 "Failed to lift all the high half inputs to the low mask!");
12301 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
12302 "Failed to lift all the low half inputs to the high mask!");
12304 // Do a half shuffle for the low mask.
12305 if (!isNoopShuffleMask(LoMask))
12306 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12307 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12309 // Do a half shuffle with the high mask after shifting its values down.
12310 for (int &M : HiMask)
12313 if (!isNoopShuffleMask(HiMask))
12314 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12315 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12320 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12321 /// blend if only one input is used.
12322 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
12323 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12324 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
12326 SDValue V1Mask[16];
12327 SDValue V2Mask[16];
12331 int Size = Mask.size();
12332 int Scale = 16 / Size;
12333 for (int i = 0; i < 16; ++i) {
12334 if (Mask[i / Scale] < 0) {
12335 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
12337 const int ZeroMask = 0x80;
12338 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
12340 int V2Idx = Mask[i / Scale] < Size
12342 : (Mask[i / Scale] - Size) * Scale + i % Scale;
12343 if (Zeroable[i / Scale])
12344 V1Idx = V2Idx = ZeroMask;
12345 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
12346 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
12347 V1InUse |= (ZeroMask != V1Idx);
12348 V2InUse |= (ZeroMask != V2Idx);
12353 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12354 DAG.getBitcast(MVT::v16i8, V1),
12355 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
12357 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12358 DAG.getBitcast(MVT::v16i8, V2),
12359 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
12361 // If we need shuffled inputs from both, blend the two.
12363 if (V1InUse && V2InUse)
12364 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
12366 V = V1InUse ? V1 : V2;
12368 // Cast the result back to the correct type.
12369 return DAG.getBitcast(VT, V);
12372 /// Generic lowering of 8-lane i16 shuffles.
12374 /// This handles both single-input shuffles and combined shuffle/blends with
12375 /// two inputs. The single input shuffles are immediately delegated to
12376 /// a dedicated lowering routine.
12378 /// The blends are lowered in one of three fundamental ways. If there are few
12379 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
12380 /// of the input is significantly cheaper when lowered as an interleaving of
12381 /// the two inputs, try to interleave them. Otherwise, blend the low and high
12382 /// halves of the inputs separately (making them have relatively few inputs)
12383 /// and then concatenate them.
12384 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12385 const APInt &Zeroable,
12386 SDValue V1, SDValue V2,
12387 const X86Subtarget &Subtarget,
12388 SelectionDAG &DAG) {
12389 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12390 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12391 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12393 // Whenever we can lower this as a zext, that instruction is strictly faster
12394 // than any alternative.
12395 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12396 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12399 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
12401 if (NumV2Inputs == 0) {
12402 // Check for being able to broadcast a single element.
12403 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12404 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12407 // Try to use shift instructions.
12408 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
12409 Zeroable, Subtarget, DAG))
12412 // Use dedicated unpack instructions for masks that match their pattern.
12414 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12417 // Use dedicated pack instructions for masks that match their pattern.
12418 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
12422 // Try to use byte rotation instructions.
12423 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
12424 Mask, Subtarget, DAG))
12427 // Make a copy of the mask so it can be modified.
12428 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
12429 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
12430 MutableMask, Subtarget,
12434 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12435 "All single-input shuffles should be canonicalized to be V1-input "
12438 // Try to use shift instructions.
12439 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12440 Zeroable, Subtarget, DAG))
12443 // See if we can use SSE4A Extraction / Insertion.
12444 if (Subtarget.hasSSE4A())
12445 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12449 // There are special ways we can lower some single-element blends.
12450 if (NumV2Inputs == 1)
12451 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12452 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12455 // We have different paths for blend lowering, but they all must use the
12456 // *exact* same predicate.
12457 bool IsBlendSupported = Subtarget.hasSSE41();
12458 if (IsBlendSupported)
12459 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12460 Zeroable, Subtarget, DAG))
12463 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12467 // Use dedicated unpack instructions for masks that match their pattern.
12469 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12472 // Use dedicated pack instructions for masks that match their pattern.
12473 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12477 // Try to use byte rotation instructions.
12478 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12479 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12482 if (SDValue BitBlend =
12483 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12486 // Try to lower by permuting the inputs into an unpack instruction.
12487 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12491 // If we can't directly blend but can use PSHUFB, that will be better as it
12492 // can both shuffle and set up the inefficient blend.
12493 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12494 bool V1InUse, V2InUse;
12495 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12496 Zeroable, DAG, V1InUse, V2InUse);
12499 // We can always bit-blend if we have to so the fallback strategy is to
12500 // decompose into single-input permutes and blends.
12501 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12505 /// Check whether a compaction lowering can be done by dropping even
12506 /// elements and compute how many times even elements must be dropped.
12508 /// This handles shuffles which take every Nth element where N is a power of
12509 /// two. Example shuffle masks:
12511 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12512 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12513 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12514 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12515 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12516 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12518 /// Any of these lanes can of course be undef.
12520 /// This routine only supports N <= 3.
12521 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12524 /// \returns N above, or the number of times even elements must be dropped if
12525 /// there is such a number. Otherwise returns zero.
12526 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12527 bool IsSingleInput) {
12528 // The modulus for the shuffle vector entries is based on whether this is
12529 // a single input or not.
12530 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12531 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12532 "We should only be called with masks with a power-of-2 size!");
12534 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12536 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12537 // and 2^3 simultaneously. This is because we may have ambiguity with
12538 // partially undef inputs.
12539 bool ViableForN[3] = {true, true, true};
12541 for (int i = 0, e = Mask.size(); i < e; ++i) {
12542 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12547 bool IsAnyViable = false;
12548 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12549 if (ViableForN[j]) {
12550 uint64_t N = j + 1;
12552 // The shuffle mask must be equal to (i * 2^N) % M.
12553 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12554 IsAnyViable = true;
12556 ViableForN[j] = false;
12558 // Early exit if we exhaust the possible powers of two.
12563 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12567 // Return 0 as there is no viable power of two.
12571 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12572 ArrayRef<int> Mask, SDValue V1,
12573 SDValue V2, SelectionDAG &DAG) {
12574 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12575 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12577 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12579 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12581 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12584 /// Generic lowering of v16i8 shuffles.
12586 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12587 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12588 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12589 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12591 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12592 const APInt &Zeroable,
12593 SDValue V1, SDValue V2,
12594 const X86Subtarget &Subtarget,
12595 SelectionDAG &DAG) {
12596 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12597 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12598 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12600 // Try to use shift instructions.
12601 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12602 Zeroable, Subtarget, DAG))
12605 // Try to use byte rotation instructions.
12606 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12607 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12610 // Use dedicated pack instructions for masks that match their pattern.
12611 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12615 // Try to use a zext lowering.
12616 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12617 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12620 // See if we can use SSE4A Extraction / Insertion.
12621 if (Subtarget.hasSSE4A())
12622 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12626 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12628 // For single-input shuffles, there are some nicer lowering tricks we can use.
12629 if (NumV2Elements == 0) {
12630 // Check for being able to broadcast a single element.
12631 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12632 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12635 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12636 // Notably, this handles splat and partial-splat shuffles more efficiently.
12637 // However, it only makes sense if the pre-duplication shuffle simplifies
12638 // things significantly. Currently, this means we need to be able to
12639 // express the pre-duplication shuffle as an i16 shuffle.
12641 // FIXME: We should check for other patterns which can be widened into an
12642 // i16 shuffle as well.
12643 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12644 for (int i = 0; i < 16; i += 2)
12645 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12650 auto tryToWidenViaDuplication = [&]() -> SDValue {
12651 if (!canWidenViaDuplication(Mask))
12653 SmallVector<int, 4> LoInputs;
12654 copy_if(Mask, std::back_inserter(LoInputs),
12655 [](int M) { return M >= 0 && M < 8; });
12656 array_pod_sort(LoInputs.begin(), LoInputs.end());
12657 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12659 SmallVector<int, 4> HiInputs;
12660 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12661 array_pod_sort(HiInputs.begin(), HiInputs.end());
12662 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12665 bool TargetLo = LoInputs.size() >= HiInputs.size();
12666 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12667 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12669 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12670 SmallDenseMap<int, int, 8> LaneMap;
12671 for (int I : InPlaceInputs) {
12672 PreDupI16Shuffle[I/2] = I/2;
12675 int j = TargetLo ? 0 : 4, je = j + 4;
12676 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12677 // Check if j is already a shuffle of this input. This happens when
12678 // there are two adjacent bytes after we move the low one.
12679 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12680 // If we haven't yet mapped the input, search for a slot into which
12682 while (j < je && PreDupI16Shuffle[j] >= 0)
12686 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12689 // Map this input with the i16 shuffle.
12690 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12693 // Update the lane map based on the mapping we ended up with.
12694 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12696 V1 = DAG.getBitcast(
12698 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12699 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12701 // Unpack the bytes to form the i16s that will be shuffled into place.
12702 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12703 MVT::v16i8, V1, V1);
12705 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12706 for (int i = 0; i < 16; ++i)
12707 if (Mask[i] >= 0) {
12708 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12709 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12710 if (PostDupI16Shuffle[i / 2] < 0)
12711 PostDupI16Shuffle[i / 2] = MappedMask;
12713 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12714 "Conflicting entries in the original shuffle!");
12716 return DAG.getBitcast(
12718 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12719 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12721 if (SDValue V = tryToWidenViaDuplication())
12725 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12729 // Use dedicated unpack instructions for masks that match their pattern.
12731 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12734 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12735 // with PSHUFB. It is important to do this before we attempt to generate any
12736 // blends but after all of the single-input lowerings. If the single input
12737 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12738 // want to preserve that and we can DAG combine any longer sequences into
12739 // a PSHUFB in the end. But once we start blending from multiple inputs,
12740 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12741 // and there are *very* few patterns that would actually be faster than the
12742 // PSHUFB approach because of its ability to zero lanes.
12744 // FIXME: The only exceptions to the above are blends which are exact
12745 // interleavings with direct instructions supporting them. We currently don't
12746 // handle those well here.
12747 if (Subtarget.hasSSSE3()) {
12748 bool V1InUse = false;
12749 bool V2InUse = false;
12751 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12752 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12754 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12755 // do so. This avoids using them to handle blends-with-zero which is
12756 // important as a single pshufb is significantly faster for that.
12757 if (V1InUse && V2InUse) {
12758 if (Subtarget.hasSSE41())
12759 if (SDValue Blend = lowerVectorShuffleAsBlend(
12760 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12763 // We can use an unpack to do the blending rather than an or in some
12764 // cases. Even though the or may be (very minorly) more efficient, we
12765 // preference this lowering because there are common cases where part of
12766 // the complexity of the shuffles goes away when we do the final blend as
12768 // FIXME: It might be worth trying to detect if the unpack-feeding
12769 // shuffles will both be pshufb, in which case we shouldn't bother with
12771 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12772 DL, MVT::v16i8, V1, V2, Mask, DAG))
12775 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12776 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12777 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12783 // There are special ways we can lower some single-element blends.
12784 if (NumV2Elements == 1)
12785 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12786 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12789 if (SDValue BitBlend =
12790 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12793 // Check whether a compaction lowering can be done. This handles shuffles
12794 // which take every Nth element for some even N. See the helper function for
12797 // We special case these as they can be particularly efficiently handled with
12798 // the PACKUSB instruction on x86 and they show up in common patterns of
12799 // rearranging bytes to truncate wide elements.
12800 bool IsSingleInput = V2.isUndef();
12801 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12802 // NumEvenDrops is the power of two stride of the elements. Another way of
12803 // thinking about it is that we need to drop the even elements this many
12804 // times to get the original input.
12806 // First we need to zero all the dropped bytes.
12807 assert(NumEvenDrops <= 3 &&
12808 "No support for dropping even elements more than 3 times.");
12809 // We use the mask type to pick which bytes are preserved based on how many
12810 // elements are dropped.
12811 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12812 SDValue ByteClearMask = DAG.getBitcast(
12813 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12814 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12815 if (!IsSingleInput)
12816 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12818 // Now pack things back together.
12819 V1 = DAG.getBitcast(MVT::v8i16, V1);
12820 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12821 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12822 for (int i = 1; i < NumEvenDrops; ++i) {
12823 Result = DAG.getBitcast(MVT::v8i16, Result);
12824 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12830 // Handle multi-input cases by blending single-input shuffles.
12831 if (NumV2Elements > 0)
12832 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12835 // The fallback path for single-input shuffles widens this into two v8i16
12836 // vectors with unpacks, shuffles those, and then pulls them back together
12840 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12841 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12842 for (int i = 0; i < 16; ++i)
12844 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12846 SDValue VLoHalf, VHiHalf;
12847 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12848 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12850 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12851 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12852 // Use a mask to drop the high bytes.
12853 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12854 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12855 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12857 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12858 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12860 // Squash the masks to point directly into VLoHalf.
12861 for (int &M : LoBlendMask)
12864 for (int &M : HiBlendMask)
12868 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12869 // VHiHalf so that we can blend them as i16s.
12870 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12872 VLoHalf = DAG.getBitcast(
12873 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12874 VHiHalf = DAG.getBitcast(
12875 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12878 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12879 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12881 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12884 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
12886 /// This routine breaks down the specific type of 128-bit shuffle and
12887 /// dispatches to the lowering routines accordingly.
12888 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12889 MVT VT, SDValue V1, SDValue V2,
12890 const APInt &Zeroable,
12891 const X86Subtarget &Subtarget,
12892 SelectionDAG &DAG) {
12893 switch (VT.SimpleTy) {
12895 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12897 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12899 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12901 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12903 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12905 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12908 llvm_unreachable("Unimplemented!");
12912 /// Generic routine to split vector shuffle into half-sized shuffles.
12914 /// This routine just extracts two subvectors, shuffles them independently, and
12915 /// then concatenates them back together. This should work effectively with all
12916 /// AVX vector shuffle types.
12917 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12918 SDValue V2, ArrayRef<int> Mask,
12919 SelectionDAG &DAG) {
12920 assert(VT.getSizeInBits() >= 256 &&
12921 "Only for 256-bit or wider vector shuffles!");
12922 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12923 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12925 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12926 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12928 int NumElements = VT.getVectorNumElements();
12929 int SplitNumElements = NumElements / 2;
12930 MVT ScalarVT = VT.getVectorElementType();
12931 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12933 // Rather than splitting build-vectors, just build two narrower build
12934 // vectors. This helps shuffling with splats and zeros.
12935 auto SplitVector = [&](SDValue V) {
12936 V = peekThroughBitcasts(V);
12938 MVT OrigVT = V.getSimpleValueType();
12939 int OrigNumElements = OrigVT.getVectorNumElements();
12940 int OrigSplitNumElements = OrigNumElements / 2;
12941 MVT OrigScalarVT = OrigVT.getVectorElementType();
12942 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12946 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12948 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12949 DAG.getIntPtrConstant(0, DL));
12950 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12951 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12954 SmallVector<SDValue, 16> LoOps, HiOps;
12955 for (int i = 0; i < OrigSplitNumElements; ++i) {
12956 LoOps.push_back(BV->getOperand(i));
12957 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12959 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12960 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12962 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12963 DAG.getBitcast(SplitVT, HiV));
12966 SDValue LoV1, HiV1, LoV2, HiV2;
12967 std::tie(LoV1, HiV1) = SplitVector(V1);
12968 std::tie(LoV2, HiV2) = SplitVector(V2);
12970 // Now create two 4-way blends of these half-width vectors.
12971 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12972 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12973 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12974 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12975 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12976 for (int i = 0; i < SplitNumElements; ++i) {
12977 int M = HalfMask[i];
12978 if (M >= NumElements) {
12979 if (M >= NumElements + SplitNumElements)
12983 V2BlendMask[i] = M - NumElements;
12984 BlendMask[i] = SplitNumElements + i;
12985 } else if (M >= 0) {
12986 if (M >= SplitNumElements)
12990 V1BlendMask[i] = M;
12995 // Because the lowering happens after all combining takes place, we need to
12996 // manually combine these blend masks as much as possible so that we create
12997 // a minimal number of high-level vector shuffle nodes.
12999 // First try just blending the halves of V1 or V2.
13000 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
13001 return DAG.getUNDEF(SplitVT);
13002 if (!UseLoV2 && !UseHiV2)
13003 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13004 if (!UseLoV1 && !UseHiV1)
13005 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13007 SDValue V1Blend, V2Blend;
13008 if (UseLoV1 && UseHiV1) {
13010 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
13012 // We only use half of V1 so map the usage down into the final blend mask.
13013 V1Blend = UseLoV1 ? LoV1 : HiV1;
13014 for (int i = 0; i < SplitNumElements; ++i)
13015 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
13016 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
13018 if (UseLoV2 && UseHiV2) {
13020 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13022 // We only use half of V2 so map the usage down into the final blend mask.
13023 V2Blend = UseLoV2 ? LoV2 : HiV2;
13024 for (int i = 0; i < SplitNumElements; ++i)
13025 if (BlendMask[i] >= SplitNumElements)
13026 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
13028 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
13030 SDValue Lo = HalfBlend(LoMask);
13031 SDValue Hi = HalfBlend(HiMask);
13032 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13035 /// Either split a vector in halves or decompose the shuffles and the
13038 /// This is provided as a good fallback for many lowerings of non-single-input
13039 /// shuffles with more than one 128-bit lane. In those cases, we want to select
13040 /// between splitting the shuffle into 128-bit components and stitching those
13041 /// back together vs. extracting the single-input shuffles and blending those
// NOTE(review): this listing is sampled — the embedded source numbering skips
// lines (e.g. 13050, 13056-13057, 13061, 13066-13069), so parts of this
// function (the lambda's loop header, early returns, closing braces) are not
// visible here. Comments below describe only what the visible lines show.
13043 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
13044 SDValue V1, SDValue V2,
13045 ArrayRef<int> Mask,
13046 SelectionDAG &DAG) {
13047 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13048 "shuffles as it could then recurse on itself.");
13049 int Size = Mask.size();
13051 // If this can be modeled as a broadcast of two elements followed by a blend,
13052 // prefer that lowering. This is especially important because broadcasts can
13053 // often fold with memory operands.
// DoBothBroadcast: true iff every mask element referencing V2 picks one single
// V2 element and every element referencing V1 picks one single V1 element —
// i.e. the whole shuffle is two broadcasts plus a blend.
13054 auto DoBothBroadcast = [&] {
13055 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
13058 if (V2BroadcastIdx < 0)
13059 V2BroadcastIdx = M - Size;
13060 else if (M - Size != V2BroadcastIdx)
13062 } else if (M >= 0) {
13063 if (V1BroadcastIdx < 0)
13064 V1BroadcastIdx = M;
13065 else if (M != V1BroadcastIdx)
13070 if (DoBothBroadcast())
13071 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13074 // If the inputs all stem from a single 128-bit lane of each input, then we
13075 // split them rather than blending because the split will decompose to
13076 // unusually few instructions.
13077 int LaneCount = VT.getSizeInBits() / 128;
13078 int LaneSize = Size / LaneCount;
13079 SmallBitVector LaneInputs[2];
13080 LaneInputs[0].resize(LaneCount, false);
13081 LaneInputs[1].resize(LaneCount, false);
13082 for (int i = 0; i < Size; ++i)
// Mask[i] / Size selects the input (0 = V1, 1 = V2); the second index is the
// 128-bit lane within that input that element i reads from.
13084 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13085 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13086 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13088 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13089 // that the decomposed single-input shuffles don't end up here.
13090 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
13093 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13094 /// a permutation and blend of those lanes.
13096 /// This essentially blends the out-of-lane inputs to each lane into the lane
13097 /// from a permuted copy of the vector. This lowering strategy results in four
13098 /// instructions in the worst case for a single-input cross lane shuffle which
13099 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13100 /// of. Special cases for each particular shuffle pattern should be handled
13101 /// prior to trying this lowering.
// NOTE(review): sampled listing — embedded line 13105 (presumably the
// SelectionDAG &DAG parameter, which the body uses) is not visible here,
// along with several other interior lines; confirm against the full file.
13102 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13103 SDValue V1, SDValue V2,
13104 ArrayRef<int> Mask,
13106 const X86Subtarget &Subtarget) {
13107 // FIXME: This should probably be generalized for 512-bit vectors as well.
13108 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13109 int Size = Mask.size();
13110 int LaneSize = Size / 2;
13112 // If there are only inputs from one 128-bit lane, splitting will in fact be
13113 // less expensive. The flags track whether the given lane contains an element
13114 // that crosses to another lane.
13115 if (!Subtarget.hasAVX2()) {
13116 bool LaneCrossing[2] = {false, false};
13117 for (int i = 0; i < Size; ++i)
// An element "crosses" when its source lane differs from its destination lane.
13118 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13119 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13120 if (!LaneCrossing[0] || !LaneCrossing[1])
13121 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13123 bool LaneUsed[2] = {false, false};
13124 for (int i = 0; i < Size; ++i)
13126 LaneUsed[(Mask[i] / LaneSize)] = true;
13127 if (!LaneUsed[0] || !LaneUsed[1])
13128 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13131 assert(V2.isUndef() &&
13132 "This last part of this routine only works on single input shuffles");
// Build the blend mask used after flipping: in-lane elements come from V1
// directly; out-of-lane elements (offset by Size) come from the lane-swapped
// copy of V1 built below.
13134 SmallVector<int, 32> FlippedBlendMask(Size);
13135 for (int i = 0; i < Size; ++i)
13136 FlippedBlendMask[i] =
13137 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
13139 : Mask[i] % LaneSize +
13140 (i / LaneSize) * LaneSize + Size);
13142 // Flip the vector, and blend the results which should now be in-lane.
13143 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13144 SDValue Flipped = DAG.getBitcast(PVT, V1);
// NOTE(review): the flip mask operand (embedded line 13146) is not visible in
// this sampled listing.
13145 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
13147 Flipped = DAG.getBitcast(VT, Flipped);
13148 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
13151 /// Handle lowering 2-lane 128-bit shuffles.
// NOTE(review): sampled listing — several embedded lines are missing (e.g.
// 13159-13160 under the AVX2 unary check, 13163-13164 after the widen check),
// so early-return bodies are not visible here.
13152 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13153 SDValue V2, ArrayRef<int> Mask,
13154 const APInt &Zeroable,
13155 const X86Subtarget &Subtarget,
13156 SelectionDAG &DAG) {
13157 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
13158 if (Subtarget.hasAVX2() && V2.isUndef())
// From here on the shuffle is treated as a 2-element shuffle of 128-bit
// halves via WidenedMask.
13161 SmallVector<int, 4> WidenedMask;
13162 if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
// Zeroable bits 0-1 cover the low 128-bit half, bits 2-3 the high half.
13165 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13166 bool IsHighZero = (Zeroable & 0xc) == 0xc;
13168 // Try to use an insert into a zero vector.
13169 if (WidenedMask[0] == 0 && IsHighZero) {
13170 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13171 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13172 DAG.getIntPtrConstant(0, DL));
13173 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13174 getZeroVector(VT, Subtarget, DAG, DL), LoV,
13175 DAG.getIntPtrConstant(0, DL));
13178 // TODO: If minimizing size and one of the inputs is a zero vector and the
13179 // the zero vector has only one use, we could use a VPERM2X128 to save the
13180 // instruction bytes needed to explicitly generate the zero vector.
13182 // Blends are faster and handle all the non-lane-crossing cases.
13183 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
13184 Zeroable, Subtarget, DAG))
13187 // If either input operand is a zero vector, use VPERM2X128 because its mask
13188 // allows us to replace the zero input with an implicit zero.
13189 if (!IsLowZero && !IsHighZero) {
13190 // Check for patterns which can be matched with a single insert of a 128-bit
13192 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13193 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13195 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13196 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13197 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13198 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13199 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13200 OnlyUsesV1 ? V1 : V2,
13201 DAG.getIntPtrConstant(0, DL));
13202 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13203 DAG.getIntPtrConstant(2, DL));
13207 // Try to use SHUF128 if possible.
13208 if (Subtarget.hasVLX()) {
// SHUF128 requires the low result half from V1 (index < 2) and the high
// result half from V2 (index >= 2); the immediate picks which half of each.
13209 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13210 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13211 ((WidenedMask[1] % 2) << 1);
13212 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
13213 DAG.getConstant(PermMask, DL, MVT::i8));
13218 // Otherwise form a 128-bit permutation. After accounting for undefs,
13219 // convert the 64-bit shuffle mask selection values into 128-bit
13220 // selection bits by dividing the indexes by 2 and shifting into positions
13221 // defined by a vperm2*128 instruction's immediate control byte.
13223 // The immediate permute control byte looks like this:
13224 // [1:0] - select 128 bits from sources for low half of destination
13226 // [3] - zero low half of destination
13227 // [5:4] - select 128 bits from sources for high half of destination
13229 // [7] - zero high half of destination
13231 assert((WidenedMask[0] >= 0 || IsLowZero) &&
13232 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
13234 unsigned PermMask = 0;
13235 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13236 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
13238 // Check the immediate mask and replace unused sources with undef.
// Selector values 0/1 name V1's halves and 2/3 name V2's halves, so a half
// reads V1 only when its selector has bit 1 clear (and the zero bit clear).
13239 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13240 V1 = DAG.getUNDEF(VT);
13241 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13242 V2 = DAG.getUNDEF(VT);
13244 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
13245 DAG.getConstant(PermMask, DL, MVT::i8));
13248 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
13249 /// shuffling each lane.
13251 /// This will only succeed when the result of fixing the 128-bit lanes results
13252 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
13253 /// each 128-bit lanes. This handles many cases where we can quickly blend away
13254 /// the lane crosses early and then use simpler shuffles within each lane.
13256 /// FIXME: It might be worthwhile at some point to support this without
13257 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13258 /// in x86 only floating point has interesting non-repeating shuffles, and even
13259 /// those are still *marginally* more expensive.
// NOTE(review): sampled listing — missing embedded lines (e.g. 13275-13277,
// 13285-13287, 13294-13297) hide the undef-skip guard and the bail-out
// returns inside the validation loop below.
13260 static SDValue lowerVectorShuffleByMerging128BitLanes(
13261 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13262 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13263 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13265 int Size = Mask.size();
13266 int LaneSize = 128 / VT.getScalarSizeInBits();
13267 int NumLanes = Size / LaneSize;
13268 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13270 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13271 // check whether the in-128-bit lane shuffles share a repeating pattern.
// Lanes[j]: which source lane feeds destination lane j (-1 = unseen yet).
// InLaneMask[k]: the lane-relative index used at position k of every lane.
13272 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13273 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
13274 for (int i = 0; i < Size; ++i) {
13278 int j = i / LaneSize;
13280 if (Lanes[j] < 0) {
13281 // First entry we've seen for this lane.
13282 Lanes[j] = Mask[i] / LaneSize;
13283 } else if (Lanes[j] != Mask[i] / LaneSize) {
13284 // This doesn't match the lane selected previously!
13288 // Check that within each lane we have a consistent shuffle mask.
13289 int k = i % LaneSize;
13290 if (InLaneMask[k] < 0) {
13291 InLaneMask[k] = Mask[i] % LaneSize;
13292 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
13293 // This doesn't fit a repeating in-lane mask.
13298 // First shuffle the lanes into place.
// Do the lane fix-up as a 64-bit element shuffle (2 x 64-bit per 128-bit
// lane), choosing f64/i64 to match the original domain.
13299 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13300 VT.getSizeInBits() / 64);
13301 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13302 for (int i = 0; i < NumLanes; ++i)
13303 if (Lanes[i] >= 0) {
13304 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
13305 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
13308 V1 = DAG.getBitcast(LaneVT, V1);
13309 V2 = DAG.getBitcast(LaneVT, V2);
13310 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13312 // Cast it back to the type we actually want.
13313 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13315 // Now do a simple shuffle that isn't lane crossing.
13316 SmallVector<int, 8> NewMask((unsigned)Size, -1);
13317 for (int i = 0; i < Size; ++i)
13319 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13320 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13321 "Must not introduce lane crosses at this point!");
13323 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
13326 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13327 /// This allows for fast cases such as subvector extraction/insertion
13328 /// or shuffling smaller vector types which can lower more efficiently.
// NOTE(review): sampled listing — missing embedded lines (e.g. 13344-13345,
// 13348, 13358, 13374-13378, 13396-13401, 13412-13417, 13422, 13427-13434)
// hide several guard conditions and early returns in this function.
13329 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13330 SDValue V1, SDValue V2,
13331 ArrayRef<int> Mask,
13332 const X86Subtarget &Subtarget,
13333 SelectionDAG &DAG) {
13334 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13335 "Expected 256-bit or 512-bit vector");
13337 unsigned NumElts = VT.getVectorNumElements();
13338 unsigned HalfNumElts = NumElts / 2;
13339 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13341 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13342 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
13343 if (!UndefLower && !UndefUpper)
13346 // Upper half is undef and lower half is whole upper subvector.
13347 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13349 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13350 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13351 DAG.getIntPtrConstant(HalfNumElts, DL));
13352 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13353 DAG.getIntPtrConstant(0, DL));
13356 // Lower half is undef and upper half is whole lower subvector.
13357 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13359 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13360 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13361 DAG.getIntPtrConstant(0, DL));
13362 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13363 DAG.getIntPtrConstant(HalfNumElts, DL));
13366 // If the shuffle only uses two of the four halves of the input operands,
13367 // then extract them and perform the 'half' shuffle at half width.
13368 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13369 int HalfIdx1 = -1, HalfIdx2 = -1;
13370 SmallVector<int, 8> HalfMask(HalfNumElts);
// Only the defined half of the mask is inspected; Offset points at it.
13371 unsigned Offset = UndefLower ? HalfNumElts : 0;
13372 for (unsigned i = 0; i != HalfNumElts; ++i) {
13373 int M = Mask[i + Offset];
13379 // Determine which of the 4 half vectors this element is from.
13380 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13381 int HalfIdx = M / HalfNumElts;
13383 // Determine the element index into its half vector source.
13384 int HalfElt = M % HalfNumElts;
13386 // We can shuffle with up to 2 half vectors, set the new 'half'
13387 // shuffle mask accordingly.
13388 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13389 HalfMask[i] = HalfElt;
13390 HalfIdx1 = HalfIdx;
13393 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
13394 HalfMask[i] = HalfElt + HalfNumElts;
13395 HalfIdx2 = HalfIdx;
13399 // Too many half vectors referenced.
13402 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13404 // Only shuffle the halves of the inputs when useful.
13405 int NumLowerHalves =
13406 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13407 int NumUpperHalves =
13408 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13410 // uuuuXXXX - don't extract uppers just to insert again.
13411 if (UndefLower && NumUpperHalves != 0)
13414 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
13415 if (UndefUpper && NumUpperHalves == 2)
13418 // AVX2 - XXXXuuuu - always extract lowers.
13419 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
13420 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
13421 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13423 // AVX2 supports variable 32-bit element cross-lane shuffles.
13424 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13425 // XXXXuuuu - don't extract lowers and uppers.
13426 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
13431 // AVX512 - XXXXuuuu - always extract lowers.
13432 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
// Materialize one of the four input halves as a HalfVT value; HalfIdx < 2
// selects from V1, otherwise V2, with the parity picking low/high half.
13435 auto GetHalfVector = [&](int HalfIdx) {
13437 return DAG.getUNDEF(HalfVT);
13438 SDValue V = (HalfIdx < 2 ? V1 : V2);
13439 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13440 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13441 DAG.getIntPtrConstant(HalfIdx, DL));
13444 SDValue Half1 = GetHalfVector(HalfIdx1);
13445 SDValue Half2 = GetHalfVector(HalfIdx2);
13446 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13447 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13448 DAG.getIntPtrConstant(Offset, DL));
13451 /// Test whether the specified input (0 or 1) is in-place blended by the
13454 /// This returns true if the elements from a particular input are already in the
13455 /// slot required by the given mask and require no permutation.
13456 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13457 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13458 int Size = Mask.size();
13459 for (int i = 0; i < Size; ++i)
// An element of the requested input is out of place when its in-input index
// (Mask[i] % Size) differs from its destination slot i.
13460 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
// NOTE(review): sampled listing — the loop's early 'return false;' and the
// trailing 'return true;' (embedded lines 13461-13465) are not visible here.
13466 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13467 /// every lane can be represented as the same repeating mask - allowing us to
13468 /// shuffle the sources with the repeating shuffle and then permute the result
13469 /// to the destination lanes.
// NOTE(review): sampled listing — many embedded lines are missing (e.g.
// 13482, 13492-13493, 13496, 13498-13503, 13506-13507, 13548, 13552-13553,
// 13556-13557, 13560-13565, 13571-13577, 13580-13581, 13585-13586,
// 13597-13599, 13602-13603, 13614-13615), so guards, continues and
// early returns in the loops below are not visible.
13470 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13471 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13472 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13473 int NumElts = VT.getVectorNumElements();
13474 int NumLanes = VT.getSizeInBits() / 128;
13475 int NumLaneElts = NumElts / NumLanes;
13477 // On AVX2 we may be able to just shuffle the lowest elements and then
13478 // broadcast the result.
13479 if (Subtarget.hasAVX2()) {
13480 for (unsigned BroadcastSize : {16, 32, 64}) {
13481 if (BroadcastSize <= VT.getScalarSizeInBits())
13483 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13485 // Attempt to match a repeating pattern every NumBroadcastElts,
13486 // accounting for UNDEFs but only references the lowest 128-bit
13487 // lane of the inputs.
13488 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13489 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13490 for (int j = 0; j != NumBroadcastElts; ++j) {
13491 int M = Mask[i + j];
13494 int &R = RepeatMask[j];
// Reject any element sourced from outside the lowest 128-bit lane.
13495 if (0 != ((M % NumElts) / NumLaneElts))
13497 if (0 <= R && R != M)
13504 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13505 if (!FindRepeatingBroadcastMask(RepeatMask))
13508 // Shuffle the (lowest) repeated elements in place for broadcast.
13509 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13511 // Shuffle the actual broadcast.
13512 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13513 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13514 for (int j = 0; j != NumBroadcastElts; ++j)
13515 BroadcastMask[i + j] = j;
13516 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13521 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13522 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13525 // Bail if we already have a repeated lane shuffle mask.
13526 SmallVector<int, 8> RepeatedShuffleMask;
13527 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13530 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13531 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13532 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13533 int NumSubLanes = NumLanes * SubLaneScale;
13534 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13536 // Check that all the sources are coming from the same lane and see if we can
13537 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13538 // determine the source sub-lane for each destination sub-lane.
13539 int TopSrcSubLane = -1;
13540 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
// One candidate repeated mask per sub-lane position within a lane
// (SubLaneScale is at most 2, hence the fixed pair).
13541 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13542 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13543 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13545 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13546 // Extract the sub-lane mask, check that it all comes from the same lane
13547 // and normalize the mask entries to come from the first lane.
13549 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13550 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13551 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13554 int Lane = (M % NumElts) / NumLaneElts;
13555 if ((0 <= SrcLane) && (SrcLane != Lane))
// Rebase the element to lane 0, preserving which input (V1/V2) it names.
13558 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13559 SubLaneMask[Elt] = LocalM;
13562 // Whole sub-lane is UNDEF.
13566 // Attempt to match against the candidate repeated sub-lane masks.
13567 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13568 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13569 for (int i = 0; i != NumSubLaneElts; ++i) {
13570 if (M1[i] < 0 || M2[i] < 0)
13572 if (M1[i] != M2[i])
13578 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13579 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13582 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13583 for (int i = 0; i != NumSubLaneElts; ++i) {
13584 int M = SubLaneMask[i];
13587 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13588 "Unexpected mask element");
13589 RepeatedSubLaneMask[i] = M;
13592 // Track the top most source sub-lane - by setting the remaining to UNDEF
13593 // we can greatly simplify shuffle matching.
13594 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13595 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13596 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13600 // Bail if we failed to find a matching repeated sub-lane mask.
13601 if (Dst2SrcSubLanes[DstSubLane] < 0)
13604 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13605 "Unexpected source lane");
13607 // Create a repeating shuffle mask for the entire vector.
13608 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13609 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13610 int Lane = SubLane / SubLaneScale;
13611 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13612 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13613 int M = RepeatedSubLaneMask[Elt];
13616 int Idx = (SubLane * NumSubLaneElts) + Elt;
13617 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13620 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13622 // Shuffle each source sub-lane to its destination.
13623 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13624 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13625 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13626 if (SrcSubLane < 0)
13628 for (int j = 0; j != NumSubLaneElts; ++j)
13629 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
// NOTE(review): the final shuffle's mask operand (embedded line 13633) is not
// visible in this sampled listing.
13632 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
// Match a 64-bit element shuffle mask against the SHUFPD pattern, writing the
// immediate into ShuffleImm. V1/V2 are taken by reference: presumably the
// commuted-match path (embedded lines 13666+, not visible here) swaps them —
// TODO confirm against the full file.
13636 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13637 unsigned &ShuffleImm,
13638 ArrayRef<int> Mask) {
13639 int NumElts = VT.getVectorNumElements();
13640 assert(VT.getScalarSizeInBits() == 64 &&
13641 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13642 "Unexpected data type for VSHUFPD");
13644 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13645 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
13647 bool ShufpdMask = true;
13648 bool CommutableMask = true;
13649 for (int i = 0; i < NumElts; ++i) {
13650 if (Mask[i] == SM_SentinelUndef)
// Even result elements must come from V1's pair, odd ones from V2's pair
// (Val); CommutVal is the same test with the operands' roles swapped.
13654 int Val = (i & 6) + NumElts * (i & 1);
13655 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13656 if (Mask[i] < Val || Mask[i] > Val + 1)
13657 ShufpdMask = false;
13658 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13659 CommutableMask = false;
// Bit i of the immediate selects the low/high element of the source pair.
13660 ShuffleImm |= (Mask[i] % 2) << i;
// NOTE(review): sampled listing — the returns for the ShufpdMask and
// CommutableMask cases (embedded lines 13661-13664, 13666-13671) are not
// visible here.
13665 if (CommutableMask) {
// Lower a 64-bit element shuffle to X86ISD::SHUFP when the mask matches the
// SHUFPD pattern; matchVectorShuffleWithSHUFPD fills in the immediate (and
// may update V1/V2, which are passed to it by reference).
13673 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13674 ArrayRef<int> Mask, SDValue V1,
13675 SDValue V2, SelectionDAG &DAG) {
13676 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13677 "Unexpected data type for VSHUFPD");
13679 unsigned Immediate = 0;
13680 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
// NOTE(review): sampled listing — the failure 'return SDValue();' (embedded
// lines 13681-13682) is not visible here.
13683 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13684 DAG.getConstant(Immediate, DL, MVT::i8));
13687 /// Handle lowering of 4-lane 64-bit floating point shuffles.
13689 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13690 /// isn't available.
// NOTE(review): sampled listing — the 'return' statements after most of the
// 'if (SDValue ...)' attempts (e.g. embedded lines 13702-13703, 13708-13709,
// 13746-13747) are not visible here; each attempt presumably returns its
// result on success, as in the visible cases.
13691 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13692 const APInt &Zeroable,
13693 SDValue V1, SDValue V2,
13694 const X86Subtarget &Subtarget,
13695 SelectionDAG &DAG) {
13696 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13697 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13698 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13700 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13701 Zeroable, Subtarget, DAG))
13704 if (V2.isUndef()) {
13705 // Check for being able to broadcast a single element.
13706 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13707 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13710 // Use low duplicate instructions for masks that match their pattern.
13711 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13712 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13714 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13715 // Non-half-crossing single input shuffles can be lowered with an
13716 // interleaved permutation.
// Each immediate bit i says whether result element i takes the high element
// of its 128-bit lane (indices 1/3) rather than the low one (0/2).
13717 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13718 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13719 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13720 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13723 // With AVX2 we have direct support for this permutation.
13724 if (Subtarget.hasAVX2())
13725 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13726 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13728 // Try to create an in-lane repeating shuffle mask and then shuffle the
13729 // results into the target lanes.
13730 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13731 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13734 // Otherwise, fall back.
13735 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13739 // Use dedicated unpack instructions for masks that match their pattern.
13741 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13744 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13745 Zeroable, Subtarget, DAG))
13748 // Check if the blend happens to exactly fit that of SHUFPD.
13750 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13753 // Try to create an in-lane repeating shuffle mask and then shuffle the
13754 // results into the target lanes.
13755 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13756 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13759 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13760 // shuffle. However, if we have AVX2 and either inputs are already in place,
13761 // we will be able to shuffle even across lanes the other input in a single
13762 // instruction so skip this pattern.
13763 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13764 isShuffleMaskInputInPlace(1, Mask))))
13765 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13766 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13768 // If we have VLX support, we can use VEXPAND.
13769 if (Subtarget.hasVLX())
13770 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13771 V1, V2, DAG, Subtarget))
13774 // If we have AVX2 then we always want to lower with a blend because an v4 we
13775 // can fully permute the elements.
13776 if (Subtarget.hasAVX2())
13777 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13780 // Otherwise fall back on generic lowering.
13781 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13784 /// Handle lowering of 4-lane 64-bit integer shuffles.
13786 /// This routine is only called when we have AVX2 and thus a reasonable
13787 /// instruction set for v4i64 shuffling..
// NOTE(review): sampled listing — the 'return' statements after the
// 'if (SDValue ...)' attempts (e.g. embedded lines 13800-13801, 13804-13805,
// 13809-13810) and the final call's trailing arguments (13875+) are not
// visible here.
13788 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13789 const APInt &Zeroable,
13790 SDValue V1, SDValue V2,
13791 const X86Subtarget &Subtarget,
13792 SelectionDAG &DAG) {
13793 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13794 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13795 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13796 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13798 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13799 Zeroable, Subtarget, DAG))
13802 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13803 Zeroable, Subtarget, DAG))
13806 // Check for being able to broadcast a single element.
13807 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13808 Mask, Subtarget, DAG))
13811 if (V2.isUndef()) {
13812 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13813 // can use lower latency instructions that will operate on both lanes.
13814 SmallVector<int, 2> RepeatedMask;
13815 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// Widen the 2-element repeated mask to a 4-element 32-bit mask so the
// shuffle can be done as an in-lane PSHUFD on v8i32.
13816 SmallVector<int, 4> PSHUFDMask;
13817 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13818 return DAG.getBitcast(
13820 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13821 DAG.getBitcast(MVT::v8i32, V1),
13822 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13825 // AVX2 provides a direct instruction for permuting a single input across
13827 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13828 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13831 // Try to use shift instructions.
13832 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13833 Zeroable, Subtarget, DAG))
13836 // If we have VLX support, we can use VALIGN or VEXPAND.
13837 if (Subtarget.hasVLX()) {
13838 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13839 Mask, Subtarget, DAG))
13842 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13843 V1, V2, DAG, Subtarget))
13847 // Try to use PALIGNR.
13848 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13849 Mask, Subtarget, DAG))
13852 // Use dedicated unpack instructions for masks that match their pattern.
13854 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13857 // Try to create an in-lane repeating shuffle mask and then shuffle the
13858 // results into the target lanes.
13859 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13860 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13863 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13864 // shuffle. However, if we have AVX2 and either inputs are already in place,
13865 // we will be able to shuffle even across lanes the other input in a single
13866 // instruction so skip this pattern.
13867 if (!isShuffleMaskInputInPlace(0, Mask) &&
13868 !isShuffleMaskInputInPlace(1, Mask))
13869 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13870 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13873 // Otherwise fall back on generic blend lowering.
13874 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13878 /// Handle lowering of 8-lane 32-bit floating point shuffles.
13880 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13881 /// isn't available.
///
/// \p Mask is the 8-element shuffle mask (-1 marks an undef lane); \p Zeroable
/// marks lanes known to be zero. Strategies are tried cheapest-first: blend,
/// broadcast, in-lane (128-bit repeated) patterns, lane permutes, then generic
/// split/blend fallbacks.
///
/// NOTE(review): this extract elides some source lines (see the gaps in the
/// embedded line numbers). Each `if (SDValue X = ...)` guard presumably
/// returns X on success - confirm against the upstream file.
13882 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13883 const APInt &Zeroable,
13884 SDValue V1, SDValue V2,
13885 const X86Subtarget &Subtarget,
13886 SelectionDAG &DAG) {
13887 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13888 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13889 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13891 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13892 Zeroable, Subtarget, DAG))
13895 // Check for being able to broadcast a single element.
13896 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13897 Mask, Subtarget, DAG))
13900 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13901 // options to efficiently lower the shuffle.
13902 SmallVector<int, 4> RepeatedMask;
13903 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13904 assert(RepeatedMask.size() == 4 &&
13905 "Repeated masks must be half the mask width!");
13907 // Use even/odd duplicate instructions for masks that match their pattern.
13908 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13909 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13910 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13911 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13914 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13915 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13917 // Use dedicated unpack instructions for masks that match their pattern.
13919 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13922 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13923 // have already handled any direct blends.
13924 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13927 // Try to create an in-lane repeating shuffle mask and then shuffle the
13928 // results into the target lanes.
13929 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13930 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13933 // If we have a single input shuffle with different shuffle patterns in the
13934 // two 128-bit lanes use the variable mask to VPERMILPS.
13935 if (V2.isUndef()) {
13936 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13937 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13938 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13940 if (Subtarget.hasAVX2())
13941 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13943 // Otherwise, fall back.
13944 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13948 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13950 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13951 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13953 // If we have VLX support, we can use VEXPAND.
13954 if (Subtarget.hasVLX())
13955 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13956 V1, V2, DAG, Subtarget))
13959 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13960 // since after split we get a more efficient code using vpunpcklwd and
13961 // vpunpckhwd instrs than vblend.
13962 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13963 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13967 // If we have AVX2 then we always want to lower with a blend because at v8 we
13968 // can fully permute the elements.
13969 if (Subtarget.hasAVX2())
13970 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13973 // Otherwise fall back on generic lowering.
13974 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13977 /// Handle lowering of 8-lane 32-bit integer shuffles.
13979 /// This routine is only called when we have AVX2 and thus a reasonable
13980 /// instruction set for v8i32 shuffling..
///
/// \p Mask is the 8-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Mirrors the v8f32 path but prefers integer-domain
/// instructions, and may bitcast to v8f32 for a single SHUFPS when that wins.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); the `return <Value>;` after several guards is not visible -
/// confirm against the upstream file.
13981 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13982 const APInt &Zeroable,
13983 SDValue V1, SDValue V2,
13984 const X86Subtarget &Subtarget,
13985 SelectionDAG &DAG) {
13986 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13987 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13988 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13989 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13991 // Whenever we can lower this as a zext, that instruction is strictly faster
13992 // than any alternative. It also allows us to fold memory operands into the
13993 // shuffle in many cases.
13994 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13995 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13998 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13999 // since after split we get a more efficient code than vblend by using
14000 // vpunpcklwd and vpunpckhwd instrs.
14001 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
14002 !Subtarget.hasAVX512())
14004 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
14007 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
14008 Zeroable, Subtarget, DAG))
14011 // Check for being able to broadcast a single element.
14012 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
14013 Mask, Subtarget, DAG))
14016 // If the shuffle mask is repeated in each 128-bit lane we can use more
14017 // efficient instructions that mirror the shuffles across the two 128-bit
14019 SmallVector<int, 4> RepeatedMask;
14020 bool Is128BitLaneRepeatedShuffle =
14021 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
14022 if (Is128BitLaneRepeatedShuffle) {
14023 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14025 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
14026 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14028 // Use dedicated unpack instructions for masks that match their pattern.
14030 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
14034 // Try to use shift instructions.
14035 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
14036 Zeroable, Subtarget, DAG))
14039 // If we have VLX support, we can use VALIGN or EXPAND.
14040 if (Subtarget.hasVLX()) {
14041 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
14042 Mask, Subtarget, DAG))
14045 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
14046 V1, V2, DAG, Subtarget))
14050 // Try to use byte rotation instructions.
14051 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14052 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14055 // Try to create an in-lane repeating shuffle mask and then shuffle the
14056 // results into the target lanes.
14057 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14058 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14061 // If the shuffle patterns aren't repeated but it is a single input, directly
14062 // generate a cross-lane VPERMD instruction.
14063 if (V2.isUndef()) {
14064 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
14065 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
14068 // Assume that a single SHUFPS is faster than an alternative sequence of
14069 // multiple instructions (even if the CPU has a domain penalty).
14070 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14071 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14072 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
14073 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
14074 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
14075 CastV1, CastV2, DAG);
14076 return DAG.getBitcast(MVT::v8i32, ShufPS);
14079 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14081 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14082 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14085 // Otherwise fall back on generic blend lowering.
14086 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
14090 /// Handle lowering of 16-lane 16-bit integer shuffles.
14092 /// This routine is only called when we have AVX2 and thus a reasonable
14093 /// instruction set for v16i16 shuffling..
///
/// \p Mask is the 16-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Single-input non-lane-crossing cases reuse the
/// general v8i16 lowering on the repeated half-mask; AVX512BW+VL can use
/// VPERMW directly.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines and closing braces are not
/// visible - confirm against the upstream file.
14094 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14095 const APInt &Zeroable,
14096 SDValue V1, SDValue V2,
14097 const X86Subtarget &Subtarget,
14098 SelectionDAG &DAG) {
14099 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14100 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14101 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14102 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
14104 // Whenever we can lower this as a zext, that instruction is strictly faster
14105 // than any alternative. It also allows us to fold memory operands into the
14106 // shuffle in many cases.
14107 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14108 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14111 // Check for being able to broadcast a single element.
14112 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
14113 Mask, Subtarget, DAG))
14116 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
14117 Zeroable, Subtarget, DAG))
14120 // Use dedicated unpack instructions for masks that match their pattern.
14122 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
14125 // Use dedicated pack instructions for masks that match their pattern.
14126 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
14130 // Try to use shift instructions.
14131 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
14132 Zeroable, Subtarget, DAG))
14135 // Try to use byte rotation instructions.
14136 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14137 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14140 // Try to create an in-lane repeating shuffle mask and then shuffle the
14141 // results into the target lanes.
14142 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14143 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14146 if (V2.isUndef()) {
14147 // There are no generalized cross-lane shuffle operations available on i16
14149 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
14150 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
14151 Mask, DAG, Subtarget);
14153 SmallVector<int, 8> RepeatedMask;
14154 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
14155 // As this is a single-input shuffle, the repeated mask should be
14156 // a strictly valid v8i16 mask that we can pass through to the v8i16
14157 // lowering to handle even the v16 case.
14158 return lowerV8I16GeneralSingleInputVectorShuffle(
14159 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
14163 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14164 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14167 // AVX512BWVL can lower to VPERMW.
14168 if (Subtarget.hasBWI() && Subtarget.hasVLX())
14169 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
14171 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14173 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14174 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14177 // Otherwise fall back on generic lowering.
14178 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
14181 /// Handle lowering of 32-lane 8-bit integer shuffles.
14183 /// This routine is only called when we have AVX2 and thus a reasonable
14184 /// instruction set for v32i8 shuffling..
///
/// \p Mask is the 32-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Same strategy ladder as v16i16, with PSHUFB as
/// the in-lane variable shuffle and VPERMB (AVX512VBMI+VL) for the
/// fully-general case.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14185 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14186 const APInt &Zeroable,
14187 SDValue V1, SDValue V2,
14188 const X86Subtarget &Subtarget,
14189 SelectionDAG &DAG) {
14190 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14191 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14192 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14193 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
14195 // Whenever we can lower this as a zext, that instruction is strictly faster
14196 // than any alternative. It also allows us to fold memory operands into the
14197 // shuffle in many cases.
14198 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14199 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14202 // Check for being able to broadcast a single element.
14203 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
14204 Mask, Subtarget, DAG))
14207 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
14208 Zeroable, Subtarget, DAG))
14211 // Use dedicated unpack instructions for masks that match their pattern.
14213 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
14216 // Use dedicated pack instructions for masks that match their pattern.
14217 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
14221 // Try to use shift instructions.
14222 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
14223 Zeroable, Subtarget, DAG))
14226 // Try to use byte rotation instructions.
14227 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14228 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14231 // Try to create an in-lane repeating shuffle mask and then shuffle the
14232 // results into the target lanes.
14233 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14234 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14237 // There are no generalized cross-lane shuffle operations available on i8
14239 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
14240 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
14243 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14244 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14247 // AVX512VBMIVL can lower to VPERMB.
14248 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14249 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
14251 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14253 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14254 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14257 // Otherwise fall back on generic lowering.
14258 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
14261 /// High-level routine to lower various 256-bit x86 vector shuffles.
14263 /// This routine either breaks down the specific type of a 256-bit x86 vector
14264 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
14265 /// together based on the available instructions.
///
/// Dispatches on \p VT to the per-type lowering routines above. Pre-AVX2
/// integer shuffles are bitcast to the same-width FP type first (or, for
/// sub-32-bit elements, handled with bit ops / a 128-bit split), since AVX1
/// has essentially no 256-bit integer shuffle support.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers) - in particular the `case MVT::...:` labels of the switch
/// are not visible; confirm against the upstream file.
14266 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14267 MVT VT, SDValue V1, SDValue V2,
14268 const APInt &Zeroable,
14269 const X86Subtarget &Subtarget,
14270 SelectionDAG &DAG) {
14271 // If we have a single input to the zero element, insert that into V1 if we
14272 // can do so cheaply.
14273 int NumElts = VT.getVectorNumElements();
14274 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14276 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14277 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14278 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14281 // Handle special cases where the lower or upper half is UNDEF.
14283 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14286 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
14287 // can check for those subtargets here and avoid much of the subtarget
14288 // querying in the per-vector-type lowering routines. With AVX1 we have
14289 // essentially *zero* ability to manipulate a 256-bit vector with integer
14290 // types. Since we'll use floating point types there eventually, just
14291 // immediately cast everything to a float and operate entirely in that domain.
14292 if (VT.isInteger() && !Subtarget.hasAVX2()) {
14293 int ElementBits = VT.getScalarSizeInBits();
14294 if (ElementBits < 32) {
14295 // No floating point type available, if we can't use the bit operations
14296 // for masking/blending then decompose into 128-bit vectors.
14298 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
14300 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
14302 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
14305 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
14306 VT.getVectorNumElements());
14307 V1 = DAG.getBitcast(FpVT, V1);
14308 V2 = DAG.getBitcast(FpVT, V2);
14309 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
14312 switch (VT.SimpleTy) {
14314 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14316 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14318 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14320 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14322 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14324 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14327 llvm_unreachable("Not a valid 256-bit x86 vector type!");
14331 /// Try to lower a vector shuffle as a 128-bit shuffles.
///
/// For 512-bit vectors with 64-bit elements: widens \p Mask to four 128-bit
/// "super-elements" and tries, in order, an insert into a zero vector, a
/// single 256-bit subvector insert, an insert of V2's low 128 bits into V1,
/// and finally a VSHUF64x2/VSHUF32x4 with an immediate control byte.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers) - e.g. the early `return SDValue();` after the
/// canWidenShuffleElements check and the V2Index declaration around the
/// insertion scan are not visible; confirm against the upstream file.
14332 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
14333 ArrayRef<int> Mask,
14334 const APInt &Zeroable,
14335 SDValue V1, SDValue V2,
14336 const X86Subtarget &Subtarget,
14337 SelectionDAG &DAG) {
14338 assert(VT.getScalarSizeInBits() == 64 &&
14339 "Unexpected element type size for 128bit shuffle.");
14341 // To handle 256 bit vector requires VLX and most probably
14342 // function lowerV2X128VectorShuffle() is better solution.
14343 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
14345 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
14346 SmallVector<int, 4> WidenedMask;
14347 if (!canWidenShuffleElements(Mask, WidenedMask))
14350 // Try to use an insert into a zero vector.
// The Zeroable bit tests below are over the 8 original 64-bit elements:
// 0xf0 = upper four zero, 0x0c = elements 2-3 zero.
14351 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
14352 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
14353 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
14354 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
14355 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14356 DAG.getIntPtrConstant(0, DL));
14357 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14358 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14359 DAG.getIntPtrConstant(0, DL));
14362 // Check for patterns which can be matched with a single insert of a 256-bit
14364 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
14365 {0, 1, 2, 3, 0, 1, 2, 3});
14366 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
14367 {0, 1, 2, 3, 8, 9, 10, 11})) {
14368 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
14369 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14370 OnlyUsesV1 ? V1 : V2,
14371 DAG.getIntPtrConstant(0, DL));
14372 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14373 DAG.getIntPtrConstant(4, DL));
14376 assert(WidenedMask.size() == 4);
14378 // See if this is an insertion of the lower 128-bits of V2 into V1.
14379 bool IsInsert = true;
14381 for (int i = 0; i < 4; ++i) {
14382 assert(WidenedMask[i] >= -1);
14383 if (WidenedMask[i] < 0)
14386 // Make sure all V1 subvectors are in place.
14387 if (WidenedMask[i] < 4) {
14388 if (WidenedMask[i] != i) {
14393 // Make sure we only have a single V2 index and its the lowest 128-bits.
14394 if (V2Index >= 0 || WidenedMask[i] != 4) {
14401 if (IsInsert && V2Index >= 0) {
14402 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14403 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
14404 DAG.getIntPtrConstant(0, DL));
14405 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
14408 // Try to lower to vshuf64x2/vshuf32x4.
14409 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
14410 unsigned PermMask = 0;
14411 // Insure elements came from the same Op.
// The instruction selects lanes 0-1 of the result from Ops[0] and lanes
// 2-3 from Ops[1], hence OpIndex = i / 2.
14412 for (int i = 0; i < 4; ++i) {
14413 assert(WidenedMask[i] >= -1);
14414 if (WidenedMask[i] < 0)
14417 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
14418 unsigned OpIndex = i / 2;
14419 if (Ops[OpIndex].isUndef())
14421 else if (Ops[OpIndex] != Op)
14424 // Convert the 128-bit shuffle mask selection values into 128-bit selection
14425 // bits defined by a vshuf64x2 instruction's immediate control byte.
14426 PermMask |= (WidenedMask[i] % 4) << (i * 2);
14429 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
14430 DAG.getConstant(PermMask, DL, MVT::i8));
14433 /// Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// \p Mask is the 8-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Tries MOVDDUP/VPERMILPI/VPERMI for single-input
/// masks, then 128-bit-block shuffles, unpack, SHUFPD, expand and blend,
/// with VPERMV (two-source permute) as the unconditional fallback.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14434 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14435 const APInt &Zeroable,
14436 SDValue V1, SDValue V2,
14437 const X86Subtarget &Subtarget,
14438 SelectionDAG &DAG) {
14439 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14440 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14441 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14443 if (V2.isUndef()) {
14444 // Use low duplicate instructions for masks that match their pattern.
14445 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
14446 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
14448 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
14449 // Non-half-crossing single input shuffles can be lowered with an
14450 // interleaved permutation.
// Each bit of the VPERMILPD immediate selects the odd element of the
// corresponding 128-bit lane, so bit i is set when lane i picks the
// high element (Mask[i] == odd index).
14451 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14452 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
14453 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
14454 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
14455 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
14456 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14459 SmallVector<int, 4> RepeatedMask;
14460 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
14461 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
14462 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14465 if (SDValue Shuf128 =
14466 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
14470 if (SDValue Unpck =
14471 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
14474 // Check if the blend happens to exactly fit that of SHUFPD.
14476 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
14479 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
14480 V2, DAG, Subtarget))
14483 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
14484 Zeroable, Subtarget, DAG))
14487 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
14490 /// Handle lowering of 16-lane 32-bit floating point shuffles.
///
/// \p Mask is the 16-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. 128-bit-lane-repeated masks get the cheap
/// MOVSLDUP/MOVSHDUP/VPERMILPI/unpack/SHUFPS forms; otherwise falls through
/// to variable VPERMILPS, VEXPAND, and finally VPERMV.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14491 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14492 const APInt &Zeroable,
14493 SDValue V1, SDValue V2,
14494 const X86Subtarget &Subtarget,
14495 SelectionDAG &DAG) {
14496 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14497 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14498 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14500 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14501 // options to efficiently lower the shuffle.
14502 SmallVector<int, 4> RepeatedMask;
14503 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
14504 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14506 // Use even/odd duplicate instructions for masks that match their pattern.
14507 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14508 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
14509 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14510 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
14513 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
14514 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14516 // Use dedicated unpack instructions for masks that match their pattern.
14517 if (SDValue Unpck =
14518 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
14521 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
14522 Zeroable, Subtarget, DAG))
14525 // Otherwise, fall back to a SHUFPS sequence.
14526 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
14529 // If we have a single input shuffle with different shuffle patterns in the
14530 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
14531 if (V2.isUndef() &&
14532 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
14533 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
14534 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
14537 // If we have AVX512F support, we can use VEXPAND.
14538 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
14539 V1, V2, DAG, Subtarget))
14542 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
14545 /// Handle lowering of 8-lane 64-bit integer shuffles.
///
/// \p Mask is the 8-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Single-input lane-repeated masks are lowered via
/// a bitcast to v16i32 PSHUFD or via VPERMI; otherwise tries 128-bit block
/// shuffles, shifts, rotates, unpack, expand and blend, ending with VPERMV.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14546 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14547 const APInt &Zeroable,
14548 SDValue V1, SDValue V2,
14549 const X86Subtarget &Subtarget,
14550 SelectionDAG &DAG) {
14551 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14552 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14553 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14555 if (V2.isUndef()) {
14556 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14557 // can use lower latency instructions that will operate on all four
14559 SmallVector<int, 2> Repeated128Mask;
14560 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
14561 SmallVector<int, 4> PSHUFDMask;
// Widen the 2-element 64-bit mask to a 4-element 32-bit mask so the
// shuffle can be expressed as a PSHUFD over v16i32.
14562 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
14563 return DAG.getBitcast(
14565 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14566 DAG.getBitcast(MVT::v16i32, V1),
14567 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14570 SmallVector<int, 4> Repeated256Mask;
14571 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14572 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14573 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14576 if (SDValue Shuf128 =
14577 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
14578 V1, V2, Subtarget, DAG))
14581 // Try to use shift instructions.
14582 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14583 Zeroable, Subtarget, DAG))
14586 // Try to use VALIGN.
14587 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14588 Mask, Subtarget, DAG))
14591 // Try to use PALIGNR.
14592 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14593 Mask, Subtarget, DAG))
14596 if (SDValue Unpck =
14597 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14599 // If we have AVX512F support, we can use VEXPAND.
14600 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14601 V2, DAG, Subtarget))
14604 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14605 Zeroable, Subtarget, DAG))
14608 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14611 /// Handle lowering of 16-lane 32-bit integer shuffles.
///
/// \p Mask is the 16-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Integer-domain analogue of the v16f32 path:
/// zext, PSHUFD/unpack for lane-repeated masks, shifts/rotates, a SHUFPS
/// via bitcast when a single one suffices, then VEXPAND/blend/VPERMV.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14612 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14613 const APInt &Zeroable,
14614 SDValue V1, SDValue V2,
14615 const X86Subtarget &Subtarget,
14616 SelectionDAG &DAG) {
14617 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14618 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14619 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14621 // Whenever we can lower this as a zext, that instruction is strictly faster
14622 // than any alternative. It also allows us to fold memory operands into the
14623 // shuffle in many cases.
14624 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14625 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14628 // If the shuffle mask is repeated in each 128-bit lane we can use more
14629 // efficient instructions that mirror the shuffles across the four 128-bit
14631 SmallVector<int, 4> RepeatedMask;
14632 bool Is128BitLaneRepeatedShuffle =
14633 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14634 if (Is128BitLaneRepeatedShuffle) {
14635 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14637 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14638 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14640 // Use dedicated unpack instructions for masks that match their pattern.
14642 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14646 // Try to use shift instructions.
14647 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14648 Zeroable, Subtarget, DAG))
14651 // Try to use VALIGN.
14652 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14653 Mask, Subtarget, DAG))
14656 // Try to use byte rotation instructions.
14657 if (Subtarget.hasBWI())
14658 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14659 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14662 // Assume that a single SHUFPS is faster than using a permv shuffle.
14663 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14664 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14665 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14666 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14667 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14668 CastV1, CastV2, DAG);
14669 return DAG.getBitcast(MVT::v16i32, ShufPS);
14671 // If we have AVX512F support, we can use VEXPAND.
14672 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14673 V1, V2, DAG, Subtarget))
14676 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14677 Zeroable, Subtarget, DAG))
14679 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14682 /// Handle lowering of 32-lane 16-bit integer shuffles.
///
/// \p Mask is the 32-element shuffle mask (-1 = undef); \p Zeroable marks
/// lanes known to be zero. Requires AVX-512 BWI. Single-input 128-bit
/// lane-repeated masks are delegated to the general v8i16 lowering;
/// otherwise blend, PSHUFB, and finally VPERMW handle the rest.
///
/// NOTE(review): this extract elides some source lines (gaps in the embedded
/// line numbers); several `return <Value>;` lines are not visible - confirm
/// against the upstream file.
14683 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14684 const APInt &Zeroable,
14685 SDValue V1, SDValue V2,
14686 const X86Subtarget &Subtarget,
14687 SelectionDAG &DAG) {
14688 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14689 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14690 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14691 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14693 // Whenever we can lower this as a zext, that instruction is strictly faster
14694 // than any alternative. It also allows us to fold memory operands into the
14695 // shuffle in many cases.
14696 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14697 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14700 // Use dedicated unpack instructions for masks that match their pattern.
14702 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14705 // Try to use shift instructions.
14706 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14707 Zeroable, Subtarget, DAG))
14710 // Try to use byte rotation instructions.
14711 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14712 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14715 if (V2.isUndef()) {
14716 SmallVector<int, 8> RepeatedMask;
14717 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14718 // As this is a single-input shuffle, the repeated mask should be
14719 // a strictly valid v8i16 mask that we can pass through to the v8i16
14720 // lowering to handle even the v32 case.
14721 return lowerV8I16GeneralSingleInputVectorShuffle(
14722 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14726 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14727 Zeroable, Subtarget, DAG))
14730 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14731 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14734 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14737 /// Handle lowering of 64-lane 8-bit integer shuffles.
// NOTE(review): lines were dropped from this chunk (embedded numbering jumps,
// e.g. 14752->14755): each `if (SDValue ...)` guard below is missing its
// success-`return` line, and the UNPCK guard header is truncated. Restore
// the missing lines from upstream LLVM before relying on this code.
14738 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14739 const APInt &Zeroable,
14740 SDValue V1, SDValue V2,
14741 const X86Subtarget &Subtarget,
14742 SelectionDAG &DAG) {
14743 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14744 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14745 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14746 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14748 // Whenever we can lower this as a zext, that instruction is strictly faster
14749 // than any alternative. It also allows us to fold memory operands into the
14750 // shuffle in many cases.
14751 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14752 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14755 // Use dedicated unpack instructions for masks that match their pattern.
14757 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14760 // Try to use shift instructions.
14761 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14762 Zeroable, Subtarget, DAG))
14765 // Try to use byte rotation instructions.
14766 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14767 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14770 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14771 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14774 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14775 if (Subtarget.hasVBMI())
14776 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14778 // Try to create an in-lane repeating shuffle mask and then shuffle the
14779 // results into the target lanes.
14780 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14781 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14784 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14785 Zeroable, Subtarget, DAG))
// Without VBMI there is no single-instruction byte permute, so the last
// resort is to split into two 256-bit shuffles and recombine.
14788 // FIXME: Implement direct support for this type!
14789 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14792 /// High-level routine to lower various 512-bit x86 vector shuffles.
14794 /// This routine either breaks down the specific type of a 512-bit x86 vector
14795 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14796 /// together based on the available instructions.
// NOTE(review): the `case MVT::vXxY:` labels of the dispatch switch (and the
// success-`return` lines of the early guards) were dropped from this chunk --
// the embedded line numbers jump (14829->14831 etc.). The per-type calls
// below strongly suggest cases v8f64/v16f32/v8i64/v16i32/v32i16/v64i8 in
// that order; confirm against upstream before compiling.
14797 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14798 MVT VT, SDValue V1, SDValue V2,
14799 const APInt &Zeroable,
14800 const X86Subtarget &Subtarget,
14801 SelectionDAG &DAG) {
14802 assert(Subtarget.hasAVX512() &&
14803 "Cannot lower 512-bit vectors w/ basic ISA!");
14805 // If we have a single input to the zero element, insert that into V1 if we
14806 // can do so cheaply.
14807 int NumElts = Mask.size();
14808 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14810 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14811 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14812 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14815 // Handle special cases where the lower or upper half is UNDEF.
14817 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14820 // Check for being able to broadcast a single element.
14821 if (SDValue Broadcast =
14822 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14825 // Dispatch to each element type for lowering. If we don't have support for
14826 // specific element type shuffles at 512 bits, immediately split them and
14827 // lower them. Each lowering routine of a given type is allowed to assume that
14828 // the requisite ISA extensions for that element type are available.
14829 switch (VT.SimpleTy) {
14831 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14833 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14835 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14837 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14839 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14841 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14844 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14848 // Lower vXi1 vector shuffles.
14849 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
14850 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
14851 // vector, shuffle and then truncate it back.
// NOTE(review): dropped lines in this chunk (numbering jumps 14856->14857,
// 14885->14887, etc.): the subvector-counting loop body, the `MVT ExtVT;`
// declaration, the switch `case` labels, and the `break;` statements are
// missing. Restore from upstream LLVM before compiling.
14852 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14853 MVT VT, SDValue V1, SDValue V2,
14854 const APInt &Zeroable,
14855 const X86Subtarget &Subtarget,
14856 SelectionDAG &DAG) {
14857 unsigned NumElts = Mask.size();
14859 // Try to recognize shuffles that are just padding a subvector with zeros.
14860 unsigned SubvecElts = 0;
14861 for (int i = 0; i != (int)NumElts; ++i) {
14862 if (Mask[i] >= 0 && Mask[i] != i)
14867 assert(SubvecElts != NumElts && "Identity shuffle?");
14869 // Clip to a power 2.
14870 SubvecElts = PowerOf2Floor(SubvecElts);
14872 // Make sure the number of zeroable bits in the top at least covers the bits
14873 // not covered by the subvector.
14874 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
// Extract the identity-shuffled prefix and re-insert it over a zero
// vector, so the upper bits are known zero.
14875 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
14876 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
14877 V1, DAG.getIntPtrConstant(0, DL));
14878 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14879 getZeroVector(VT, Subtarget, DAG, DL),
14880 Extract, DAG.getIntPtrConstant(0, DL));
14884 assert(Subtarget.hasAVX512() &&
14885 "Cannot lower 512-bit vectors w/o basic ISA!");
14887 switch (VT.SimpleTy) {
14889 llvm_unreachable("Expected a vector of i1 elements");
14891 ExtVT = MVT::v2i64;
14894 ExtVT = MVT::v4i32;
14897 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
14899 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14902 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14903 // 256-bit operation available.
14904 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
14907 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14908 // 256-bit operation available.
14909 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
14910 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
14913 ExtVT = MVT::v64i8;
14917 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14918 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14920 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14921 // i1 was sign extended we can use X86ISD::CVT2MASK.
14922 int NumElems = VT.getVectorNumElements();
// When a direct vector-compare-to-mask is available (BWI for >=32 elems,
// DQI for <32), compare the sign-extended shuffle against zero to rebuild
// the i1 mask; otherwise fall back to a plain truncate.
14923 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14924 (Subtarget.hasDQI() && (NumElems < 32)))
14925 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
14926 Shuffle, ISD::SETGT);
14928 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14931 /// Helper function that returns true if the shuffle mask should be
14932 /// commuted to improve canonicalization.
// NOTE(review): this chunk is missing lines (numbering jumps 14936->14940,
// 14948->14951, 14964->14968, ...): the counting-loop headers, the counter
// increments, and every `return true;` / `return false;` line were dropped.
// The tie-breaking logic below is only readable as a sketch; restore the
// dropped lines from upstream LLVM before compiling.
14933 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14934 int NumElements = Mask.size();
14936 int NumV1Elements = 0, NumV2Elements = 0;
14940 else if (M < NumElements)
14945 // Commute the shuffle as needed such that more elements come from V1 than
14946 // V2. This allows us to match the shuffle pattern strictly on how many
14947 // elements come from V1 without handling the symmetric cases.
14948 if (NumV2Elements > NumV1Elements)
14951 assert(NumV1Elements > 0 && "No V1 indices");
14953 if (NumV2Elements == 0)
14956 // When the number of V1 and V2 elements are the same, try to minimize the
14957 // number of uses of V2 in the low half of the vector. When that is tied,
14958 // ensure that the sum of indices for V1 is equal to or lower than the sum
14959 // indices for V2. When those are equal, try to ensure that the number of odd
14960 // indices for V1 is lower than the number of odd indices for V2.
14961 if (NumV1Elements == NumV2Elements) {
14962 int LowV1Elements = 0, LowV2Elements = 0;
// First tie-break: count V1 vs V2 references in the low half only.
14963 for (int M : Mask.slice(0, NumElements / 2))
14964 if (M >= NumElements)
14968 if (LowV2Elements > LowV1Elements)
14970 if (LowV2Elements == LowV1Elements) {
// Second tie-break: compare the sums of the element positions that
// reference each input.
14971 int SumV1Indices = 0, SumV2Indices = 0;
14972 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14973 if (Mask[i] >= NumElements)
14975 else if (Mask[i] >= 0)
14977 if (SumV2Indices < SumV1Indices)
14979 if (SumV2Indices == SumV1Indices) {
// Final tie-break: compare how many odd positions reference each input.
14980 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14981 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14982 if (Mask[i] >= NumElements)
14983 NumV2OddIndices += i % 2;
14984 else if (Mask[i] >= 0)
14985 NumV1OddIndices += i % 2;
14986 if (NumV2OddIndices < NumV1OddIndices)
14995 /// Top-level lowering for x86 vector shuffles.
14997 /// This handles decomposition, canonicalization, and lowering of all x86
14998 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14999 /// above in helper routines. The canonicalization attempts to widen shuffles
15000 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
15001 /// s.t. only one of the two inputs needs to be tested, etc.
// NOTE(review): lines were dropped from this chunk (numbering jumps
// 15009->15011, 15023->15025, 15028->15032, ...): the `SDLoc DL(Op);`
// declaration (DL is used throughout but never declared here), several guard
// conditions and their bodies, and assorted closing braces are missing.
// Restore from upstream LLVM before compiling.
15002 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
15003 SelectionDAG &DAG) {
15004 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
15005 ArrayRef<int> Mask = SVOp->getMask();
15006 SDValue V1 = Op.getOperand(0);
15007 SDValue V2 = Op.getOperand(1);
15008 MVT VT = Op.getSimpleValueType();
15009 int NumElements = VT.getVectorNumElements();
15011 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
15013 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
15014 "Can't lower MMX shuffles");
15016 bool V1IsUndef = V1.isUndef();
15017 bool V2IsUndef = V2.isUndef();
15018 if (V1IsUndef && V2IsUndef)
15019 return DAG.getUNDEF(VT);
15021 // When we create a shuffle node we put the UNDEF node to second operand,
15022 // but in some cases the first operand may be transformed to UNDEF.
15023 // In this case we should just commute the node.
15025 return DAG.getCommutedVectorShuffle(*SVOp);
15027 // Check for non-undef masks pointing at an undef vector and make the masks
15028 // undef as well. This makes it easier to match the shuffle based solely on
15032 if (M >= NumElements) {
15033 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
15034 for (int &M : NewMask)
15035 if (M >= NumElements)
15037 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15040 // Check for illegal shuffle mask element index values.
15041 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
15042 assert(llvm::all_of(Mask,
15043 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
15044 "Out of bounds shuffle index");
15046 // We actually see shuffles that are entirely re-arrangements of a set of
15047 // zero inputs. This mostly happens while decomposing complex shuffles into
15048 // simple ones. Directly lower these as a buildvector of zeros.
15049 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
15050 if (Zeroable.isAllOnesValue())
15051 return getZeroVector(VT, Subtarget, DAG, DL);
15053 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
15055 // Create an alternative mask with info about zeroable elements.
15056 // Here we do not set undef elements as zeroable.
15057 SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
15059 assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
15060 for (int i = 0; i != NumElements; ++i)
15061 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
15062 ZeroableMask[i] = SM_SentinelZero;
15065 // Try to collapse shuffles into using a vector type with fewer elements but
15066 // wider element types. We cap this to not form integers or floating point
15067 // elements wider than 64 bits, but it might be interesting to form i128
15068 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
15069 SmallVector<int, 16> WidenedMask;
15070 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
15071 canWidenShuffleElements(ZeroableMask, WidenedMask)) {
15072 MVT NewEltVT = VT.isFloatingPoint()
15073 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
15074 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
15075 int NewNumElts = NumElements / 2;
15076 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
15077 // Make sure that the new vector type is legal. For example, v2f64 isn't
15079 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
15081 // Modify the new Mask to take all zeros from the all-zero vector.
15082 // Choose indices that are blend-friendly.
15083 bool UsedZeroVector = false;
15084 assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
15085 "V2's non-undef elements are used?!");
15086 for (int i = 0; i != NewNumElts; ++i)
15087 if (WidenedMask[i] == SM_SentinelZero) {
// Point zeroed lanes at the corresponding lane of the (zero) V2, which
// is the blend-friendly choice of index.
15088 WidenedMask[i] = i + NewNumElts;
15089 UsedZeroVector = true;
15091 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
15092 // some elements to be undef.
15093 if (UsedZeroVector)
15094 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
15096 V1 = DAG.getBitcast(NewVT, V1);
15097 V2 = DAG.getBitcast(NewVT, V2);
15098 return DAG.getBitcast(
15099 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
15103 // Commute the shuffle if it will improve canonicalization.
15104 if (canonicalizeShuffleMaskWithCommute(Mask))
15105 return DAG.getCommutedVectorShuffle(*SVOp);
15108 lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
15111 // For each vector width, delegate to a specialized lowering routine.
15112 if (VT.is128BitVector())
15113 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15116 if (VT.is256BitVector())
15117 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15120 if (VT.is512BitVector())
15121 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15125 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15128 llvm_unreachable("Unimplemented!");
15131 /// Try to lower a VSELECT instruction to a vector shuffle.
// NOTE(review): dropped lines here (numbering jumps 15137->15139,
// 15153->15157): the `SDLoc dl` declaration, the bail-out
// `return SDValue();`, and the loop body that pushes mask indices for the
// taken/not-taken lanes are missing -- only the condition-element
// classification survives. Restore from upstream LLVM.
15132 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
15133 const X86Subtarget &Subtarget,
15134 SelectionDAG &DAG) {
15135 SDValue Cond = Op.getOperand(0);
15136 SDValue LHS = Op.getOperand(1);
15137 SDValue RHS = Op.getOperand(2);
15139 MVT VT = Op.getSimpleValueType();
15141 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
15143 auto *CondBV = cast<BuildVectorSDNode>(Cond);
15145 // Only non-legal VSELECTs reach this lowering, convert those into generic
15146 // shuffles and re-use the shuffle lowering path for blends.
15147 SmallVector<int, 32> Mask;
15148 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
15149 SDValue CondElt = CondBV->getOperand(i);
15151 // We can't map undef to undef here. They have different meanings. Treat
15152 // as the same as zero.
15153 if (CondElt.isUndef() || isNullConstant(CondElt))
15157 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
// Lower an ISD::VSELECT node, preferring shuffle/blend forms over generic
// expansion.
// NOTE(review): lines are missing from this chunk (numbering jumps
// 15165->15168, 15207->15209, ...): the `SDLoc dl` declaration, several
// `return Op;`/`return SDValue();` lines, the switch `case MVT::...:` labels
// before 15209 and 15214, and closing braces were dropped. Restore from
// upstream LLVM before compiling.
15160 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
15161 // A vselect where all conditions and data are constants can be optimized into
15162 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
15163 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
15164 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
15165 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
15168 // Try to lower this to a blend-style vector shuffle. This can handle all
15169 // constant condition cases.
15170 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
15173 // If this VSELECT has a vector if i1 as a mask, it will be directly matched
15174 // with patterns on the mask registers on AVX-512.
15175 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
15178 // Variable blends are only legal from SSE4.1 onward.
15179 if (!Subtarget.hasSSE41())
15183 MVT VT = Op.getSimpleValueType();
15185 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
15186 // into an i1 condition so that we can use the mask-based 512-bit blend
15188 if (VT.getSizeInBits() == 512) {
15189 SDValue Cond = Op.getOperand(0);
15190 // The vNi1 condition case should be handled above as it can be trivially
15192 assert(Cond.getValueType().getScalarSizeInBits() ==
15193 VT.getScalarSizeInBits() &&
15194 "Should have a size-matched integer condition!");
15195 // Build a mask by testing the condition against zero.
15196 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
15197 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
15198 getZeroVector(VT, Subtarget, DAG, dl),
15200 // Now return a new VSELECT using the mask.
15201 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
15204 // Only some types will be legal on some subtargets. If we can emit a legal
15205 // VSELECT-matching blend, return Op, and but if we need to expand, return
15207 switch (VT.SimpleTy) {
15209 // Most of the vector types have blends past SSE4.1.
15213 // The byte blends for AVX vectors were introduced only in AVX2.
15214 if (Subtarget.hasAVX2())
15220 case MVT::v16i16: {
15221 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
15222 MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
15223 SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
15224 SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
15225 SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
15226 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
15227 return DAG.getBitcast(VT, Select);
// SSE4.1-specific lowering of EXTRACT_VECTOR_ELT from a 128-bit vector.
// NOTE(review): dropped lines here (numbering jumps 15233->15236,
// 15267->15274): the `SDLoc dl` declaration, the `return SDValue();`
// bail-outs, the `return Op;` for the constant-index i32/i64 case, and the
// closing brace are missing. Restore from upstream LLVM.
15232 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
15233 MVT VT = Op.getSimpleValueType();
15236 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
15239 if (VT.getSizeInBits() == 8) {
// PEXTRB produces a 32-bit result; truncate back to the requested type.
15240 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
15241 Op.getOperand(0), Op.getOperand(1));
15242 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15245 if (VT == MVT::f32) {
15246 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
15247 // the result back to FR32 register. It's only worth matching if the
15248 // result has a single use which is a store or a bitcast to i32. And in
15249 // the case of a store, it's not worth it if the index is a constant 0,
15250 // because a MOVSSmr can be used instead, which is smaller and faster.
15251 if (!Op.hasOneUse())
15253 SDNode *User = *Op.getNode()->use_begin();
15254 if ((User->getOpcode() != ISD::STORE ||
15255 isNullConstant(Op.getOperand(1))) &&
15256 (User->getOpcode() != ISD::BITCAST ||
15257 User->getValueType(0) != MVT::i32))
15259 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15260 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
15262 return DAG.getBitcast(MVT::f32, Extract);
15265 if (VT == MVT::i32 || VT == MVT::i64) {
15266 // ExtractPS/pextrq works with constant index.
15267 if (isa<ConstantSDNode>(Op.getOperand(1)))
15274 /// Extract one bit from mask vector, like v16i1 or v8i1.
15275 /// AVX-512 feature.
// NOTE(review): dropped lines here (numbering jumps 15278->15280,
// 15312->15314): the `SDLoc dl` declaration and the `if (IdxVal != 0)` guard
// in front of the KSHIFTR are missing (the "Extracts from element 0 are
// always allowed" comment implies the shift is conditional). Restore from
// upstream LLVM.
15276 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
15277 const X86Subtarget &Subtarget) {
15278 SDValue Vec = Op.getOperand(0);
15280 MVT VecVT = Vec.getSimpleValueType();
15281 SDValue Idx = Op.getOperand(1);
15282 MVT EltVT = Op.getSimpleValueType();
15284 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
15285 "Unexpected vector type in ExtractBitFromMaskVector");
15287 // variable index can't be handled in mask registers,
15288 // extend vector to VR512/128
15289 if (!isa<ConstantSDNode>(Idx)) {
15290 unsigned NumElts = VecVT.getVectorNumElements();
15291 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
15292 // than extending to 128/256bit.
15293 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15294 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15295 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
15296 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
15297 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
15300 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15302 // If the kshift instructions of the correct width aren't natively supported
15303 // then we need to promote the vector to the native size to get the correct
15304 // zeroing behavior.
15305 if (VecVT.getVectorNumElements() < 16) {
15306 VecVT = MVT::v16i1;
15307 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
15308 DAG.getUNDEF(VecVT), Vec,
15309 DAG.getIntPtrConstant(0, dl));
15312 // Extracts from element 0 are always allowed.
15314 // Use kshiftr instruction to move to the lower element.
15315 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
15316 DAG.getConstant(IdxVal, dl, MVT::i8));
15319 // Shrink to v16i1 since that's always legal.
15320 if (VecVT.getVectorNumElements() > 16) {
15321 VecVT = MVT::v16i1;
15322 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
15323 DAG.getIntPtrConstant(0, dl));
15326 // Convert to a bitcast+aext/trunc.
15327 MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
15328 return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
// Lower EXTRACT_VECTOR_ELT for all x86 vector types: i1 masks are delegated,
// wide vectors are narrowed to 128 bits first, then per-size strategies apply.
// NOTE(review): dropped lines in this chunk (numbering jumps 15333->15335,
// 15346->15348, 15371->15376, 15428->15430, 15446->15450, ...): the
// `SDLoc dl` declaration, the spill-to-stack `return SDValue();` at the end
// of the variable-index comment block, the `if (IdxVal == 0) return Op;`
// fast paths before the 32/64-bit shuffle fallbacks, and several shift
// guards are missing. Restore from upstream LLVM before compiling.
15332 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15333 SelectionDAG &DAG) const {
15335 SDValue Vec = Op.getOperand(0);
15336 MVT VecVT = Vec.getSimpleValueType();
15337 SDValue Idx = Op.getOperand(1);
15339 if (VecVT.getVectorElementType() == MVT::i1)
15340 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
15342 if (!isa<ConstantSDNode>(Idx)) {
15343 // Its more profitable to go through memory (1 cycles throughput)
15344 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
15345 // IACA tool was used to get performance estimation
15346 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
15348 // example : extractelement <16 x i8> %a, i32 %i
15350 // Block Throughput: 3.00 Cycles
15351 // Throughput Bottleneck: Port5
15353 // | Num Of | Ports pressure in cycles | |
15354 // | Uops | 0 - DV | 5 | 6 | 7 | |
15355 // ---------------------------------------------
15356 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
15357 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
15358 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
15359 // Total Num Of Uops: 4
15362 // Block Throughput: 1.00 Cycles
15363 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
15365 // | | Ports pressure in cycles | |
15366 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
15367 // ---------------------------------------------------------
15368 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
15369 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
15370 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
15371 // Total Num Of Uops: 4
15376 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15378 // If this is a 256-bit vector result, first extract the 128-bit vector and
15379 // then extract the element from the 128-bit vector.
15380 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15381 // Get the 128-bit vector.
15382 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15383 MVT EltVT = VecVT.getVectorElementType();
15385 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15386 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15388 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15389 // this can be done with a mask.
15390 IdxVal &= ElemsPerChunk - 1;
15391 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15392 DAG.getConstant(IdxVal, dl, MVT::i32));
15395 assert(VecVT.is128BitVector() && "Unexpected vector length");
15397 MVT VT = Op.getSimpleValueType();
15399 if (VT.getSizeInBits() == 16) {
15400 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15401 // we're going to zero extend the register or fold the store (SSE41 only).
15402 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15403 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15404 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15405 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15406 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15408 // Transform it so it match pextrw which produces a 32-bit result.
15409 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15410 Op.getOperand(0), Op.getOperand(1));
15411 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15414 if (Subtarget.hasSSE41())
15415 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15418 // TODO: We only extract a single element from v16i8, we can probably afford
15419 // to be more aggressive here before using the default approach of spilling to
15421 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15422 // Extract either the lowest i32 or any i16, and extract the sub-byte.
15423 int DWordIdx = IdxVal / 4;
15424 if (DWordIdx == 0) {
15425 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15426 DAG.getBitcast(MVT::v4i32, Vec),
15427 DAG.getIntPtrConstant(DWordIdx, dl));
// Shift the wanted byte down to bit 0 of the extracted dword.
15428 int ShiftVal = (IdxVal % 4) * 8;
15430 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15431 DAG.getConstant(ShiftVal, dl, MVT::i8));
15432 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15435 int WordIdx = IdxVal / 2;
15436 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15437 DAG.getBitcast(MVT::v8i16, Vec),
15438 DAG.getIntPtrConstant(WordIdx, dl));
// Shift the wanted byte down to bit 0 of the extracted word.
15439 int ShiftVal = (IdxVal % 2) * 8;
15441 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15442 DAG.getConstant(ShiftVal, dl, MVT::i8));
15443 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15446 if (VT.getSizeInBits() == 32) {
15450 // SHUFPS the element to the lowest double word, then movss.
15451 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15452 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15453 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15454 DAG.getIntPtrConstant(0, dl));
15457 if (VT.getSizeInBits() == 64) {
15458 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15459 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15460 // to match extract_elt for f64.
15464 // UNPCKHPD the element to the lowest double word, then movsd.
15465 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
15466 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
15467 int Mask[2] = { 1, -1 };
15468 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15469 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15470 DAG.getIntPtrConstant(0, dl));
15476 /// Insert one bit to mask vector, like v16i1 or v8i1.
15477 /// AVX-512 feature.
// NOTE(review): dropped lines here (numbering jumps 15479->15481,
// 15499->15501, 15501->15505): the `SDLoc dl` declaration, and the line(s)
// between the SCALAR_TO_VECTOR and the INSERT_SUBVECTOR -- the
// INSERT_SUBVECTOR call at 15501 is missing its index operand and closing
// of the expression. Restore from upstream LLVM before compiling.
15478 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15479 const X86Subtarget &Subtarget) {
15481 SDValue Vec = Op.getOperand(0);
15482 SDValue Elt = Op.getOperand(1);
15483 SDValue Idx = Op.getOperand(2);
15484 MVT VecVT = Vec.getSimpleValueType();
15486 if (!isa<ConstantSDNode>(Idx)) {
15487 // Non constant index. Extend source and destination,
15488 // insert element and then truncate the result.
15489 unsigned NumElts = VecVT.getVectorNumElements();
15490 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15491 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15492 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15493 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15494 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15495 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15498 // Copy into a k-register, extract to v1i1 and insert_subvector.
15499 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15501 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15505 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15506 SelectionDAG &DAG) const {
15507 MVT VT = Op.getSimpleValueType();
15508 MVT EltVT = VT.getVectorElementType();
15509 unsigned NumElts = VT.getVectorNumElements();
15511 if (EltVT == MVT::i1)
15512 return InsertBitToMaskVector(Op, DAG, Subtarget);
15515 SDValue N0 = Op.getOperand(0);
15516 SDValue N1 = Op.getOperand(1);
15517 SDValue N2 = Op.getOperand(2);
15518 if (!isa<ConstantSDNode>(N2))
15520 auto *N2C = cast<ConstantSDNode>(N2);
15521 unsigned IdxVal = N2C->getZExtValue();
15523 bool IsZeroElt = X86::isZeroNode(N1);
15524 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15526 // If we are inserting a element, see if we can do this more efficiently with
15527 // a blend shuffle with a rematerializable vector than a costly integer
15529 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15530 16 <= EltVT.getSizeInBits()) {
15531 SmallVector<int, 8> BlendMask;
15532 for (unsigned i = 0; i != NumElts; ++i)
15533 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15534 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15535 : getOnesVector(VT, DAG, dl);
15536 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15539 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15540 // into that, and then insert the subvector back into the result.
15541 if (VT.is256BitVector() || VT.is512BitVector()) {
15542 // With a 256-bit vector, we can insert into the zero element efficiently
15543 // using a blend if we have AVX or AVX2 and the right data type.
15544 if (VT.is256BitVector() && IdxVal == 0) {
15545 // TODO: It is worthwhile to cast integer to floating point and back
15546 // and incur a domain crossing penalty if that's what we'll end up
15547 // doing anyway after extracting to a 128-bit vector.
15548 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15549 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15550 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15551 N2 = DAG.getIntPtrConstant(1, dl);
15552 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15556 // Get the desired 128-bit vector chunk.
15557 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15559 // Insert the element into the desired chunk.
15560 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15561 assert(isPowerOf2_32(NumEltsIn128));
15562 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
15563 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15565 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15566 DAG.getConstant(IdxIn128, dl, MVT::i32));
15568 // Insert the changed part back into the bigger vector
15569 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15571 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15573 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
15574 // argument. SSE41 required for pinsrb.
15575 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15577 if (VT == MVT::v8i16) {
15578 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15579 Opc = X86ISD::PINSRW;
15581 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15582 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15583 Opc = X86ISD::PINSRB;
15586 if (N1.getValueType() != MVT::i32)
15587 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15588 if (N2.getValueType() != MVT::i32)
15589 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15590 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15593 if (Subtarget.hasSSE41()) {
15594 if (EltVT == MVT::f32) {
15595 // Bits [7:6] of the constant are the source select. This will always be
15596 // zero here. The DAG Combiner may combine an extract_elt index into
15597 // these bits. For example (insert (extract, 3), 2) could be matched by
15598 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15599 // Bits [5:4] of the constant are the destination select. This is the
15600 // value of the incoming immediate.
15601 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15602 // combine either bitwise AND or insert of float 0.0 to set these bits.
15604 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15605 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15606 // If this is an insertion of 32-bits into the low 32-bits of
15607 // a vector, we prefer to generate a blend with immediate rather
15608 // than an insertps. Blends are simpler operations in hardware and so
15609 // will always have equal or better performance than insertps.
15610 // But if optimizing for size and there's a load folding opportunity,
15611 // generate insertps because blendps does not have a 32-bit memory
15613 N2 = DAG.getIntPtrConstant(1, dl);
15614 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15615 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15617 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15618 // Create this as a scalar to vector..
15619 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15620 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15623 // PINSR* works with constant index.
15624 if (EltVT == MVT::i32 || EltVT == MVT::i64)
// Lower ISD::SCALAR_TO_VECTOR: build a vector whose element 0 is the scalar
// operand and whose remaining elements are undef.
//  * A zero scalar becomes a full zero vector (xorps beats xor+movd).
//  * Results wider than 128 bits are built as a 128-bit vector first and then
//    widened into an undef vector of the full width.
//  * Other integer scalars are any-extended to i32 and built as v4i32 (the
//    form the tblgen patterns expect), then bitcast back to the result type.
15631 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15632 SelectionDAG &DAG) {
15634 MVT OpVT = Op.getSimpleValueType();
15636 // It's always cheaper to replace a xor+movd with xorps and simplifies further
15638 if (X86::isZeroNode(Op.getOperand(0)))
15639 return getZeroVector(OpVT, Subtarget, DAG, dl);
15641 // If this is a 256-bit vector result, first insert into a 128-bit
15642 // vector and then insert into the 256-bit vector.
15643 if (!OpVT.is128BitVector()) {
15644 // Insert into a 128-bit vector.
15645 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15646 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15647 OpVT.getVectorNumElements() / SizeFactor);
15649 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15651 // Insert the 128-bit vector.
15652 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15654 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15656 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15657 if (OpVT == MVT::v4i32)
// NOTE(review): the taken branch is elided in this excerpt — presumably
// `return Op;` (node is already canonical). Confirm against upstream.
15660 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15661 return DAG.getBitcast(
15662 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15665 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15666 // simple superregister reference or explicit instructions to insert
15667 // the upper bits of a vector.
// Only vXi1 (AVX-512 mask register) insert_subvectors reach this hook (see
// the assert); everything is forwarded to the mask-vector helper.
15668 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15669 SelectionDAG &DAG) {
15670 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15672 return insert1BitVector(Op, DAG, Subtarget);
// Custom-lower EXTRACT_SUBVECTOR of vXi1 mask vectors. For a non-zero
// constant index: widen the mask to a width KSHIFTR supports, shift the
// wanted bits down to the LSBs, and re-extract at index 0 (which is legal).
15675 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15676 SelectionDAG &DAG) {
15677 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15678 "Only vXi1 extract_subvectors need custom lowering");
15681 SDValue Vec = Op.getOperand(0);
15682 SDValue Idx = Op.getOperand(1);
// Variable indices are not handled here. NOTE(review): the taken branch is
// elided in this excerpt — presumably returns SDValue().
15684 if (!isa<ConstantSDNode>(Idx))
15687 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15688 if (IdxVal == 0) // the operation is legal
15691 MVT VecVT = Vec.getSimpleValueType();
15692 unsigned NumElems = VecVT.getVectorNumElements();
15694 // Extend to natively supported kshift.
// Minimum kshift width is v8i1 with DQI, otherwise v16i1; widen by inserting
// the mask into an undef vector at index 0.
15695 MVT WideVecVT = VecVT;
15696 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15697 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15698 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15699 DAG.getUNDEF(WideVecVT), Vec,
15700 DAG.getIntPtrConstant(0, dl));
15703 // Shift to the LSB.
15704 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15705 DAG.getConstant(IdxVal, dl, MVT::i8));
15707 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15708 DAG.getIntPtrConstant(0, dl));
15711 // Returns the appropriate wrapper opcode for a global reference.
// X86ISD::WrapperRIP marks addresses that must be materialized RIP-relative
// (RIP-relative PIC under the small/kernel code models, and any GOTPCREL
// reference); X86ISD::Wrapper is the plain absolute form.
15712 unsigned X86TargetLowering::getGlobalWrapperKind(
15713 const GlobalValue *GV, const unsigned char OpFlags) const {
15714 // References to absolute symbols are never PC-relative.
15715 if (GV && GV->isAbsoluteSymbolRef())
15716 return X86ISD::Wrapper;
15718 CodeModel::Model M = getTargetMachine().getCodeModel();
15719 if (Subtarget.isPICStyleRIPRel() &&
15720 (M == CodeModel::Small || M == CodeModel::Kernel))
15721 return X86ISD::WrapperRIP;
15723 // GOTPCREL references must always use RIP.
15724 if (OpFlags == X86II::MO_GOTPCREL)
15725 return X86ISD::WrapperRIP;
15727 return X86ISD::Wrapper;
15730 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15731 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15732 // one of the above mentioned nodes. It has to be wrapped because otherwise
15733 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15734 // be used to form addressing mode. These wrapped nodes will be selected
// during instruction selection.
15737 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15738 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15740 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15741 // global base reg.
15742 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15744 auto PtrVT = getPointerTy(DAG.getDataLayout());
15745 SDValue Result = DAG.getTargetConstantPool(
15746 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15748 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15749 // With PIC, the address is actually $g + Offset.
// NOTE(review): the PIC-base guard around this ADD and the trailing
// `return Result;` are elided in this excerpt — confirm against upstream.
15752 DAG.getNode(ISD::ADD, DL, PtrVT,
15753 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lower a JumpTableSDNode to its Target counterpart wrapped in
// X86ISD::Wrapper[RIP], adding the PIC global base register when required.
15759 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15760 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15762 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15763 // global base reg.
15764 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15766 auto PtrVT = getPointerTy(DAG.getDataLayout());
15767 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15769 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15771 // With PIC, the address is actually $g + Offset.
// NOTE(review): the PIC-base guard around this ADD and the trailing
// `return Result;` are elided in this excerpt — confirm against upstream.
15774 DAG.getNode(ISD::ADD, DL, PtrVT,
15775 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lower an ExternalSymbolSDNode: wrap the target symbol node, add the PIC
// base for 32-bit PIC, and load through the GOT when the reference
// classification says the symbol is reached via a stub.
15781 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15782 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15784 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15785 // global base reg.
15786 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15787 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15789 auto PtrVT = getPointerTy(DAG.getDataLayout());
15790 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15793 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15795 // With PIC, the address is actually $g + Offset.
15796 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15798 DAG.getNode(ISD::ADD, DL, PtrVT,
15799 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15802 // For symbols that require a load from a stub to get the address, emit the
// load through the GOT. NOTE(review): the trailing `return Result;` is
// elided in this excerpt.
15804 if (isGlobalStubReference(OpFlag))
15805 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15806 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// Lower a BlockAddressSDNode (address of a basic block, e.g. for indirectbr)
// to a wrapped TargetBlockAddress, adding the PIC base when the reference is
// relative to it.
15812 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15813 // Create the TargetBlockAddressAddress node.
15814 unsigned char OpFlags =
15815 Subtarget.classifyBlockAddressReference();
15816 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15817 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15819 auto PtrVT = getPointerTy(DAG.getDataLayout());
15820 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15821 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15823 // With PIC, the address is actually $g + Offset.
// NOTE(review): closing brace and `return Result;` are elided in this
// excerpt.
15824 if (isGlobalRelativeToPICBase(OpFlags)) {
15825 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15826 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
// Core global-address lowering: create a TargetGlobalAddress — folding the
// constant offset in when the code model allows — wrap it, then apply, in
// order: PIC-base addition, GOT stub load, and an explicit ADD for any
// offset that could not be folded.
15832 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15833 const SDLoc &dl, int64_t Offset,
15834 SelectionDAG &DAG) const {
15835 // Create the TargetGlobalAddress node, folding in the constant
15836 // offset if it is legal.
15837 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15838 CodeModel::Model M = DAG.getTarget().getCodeModel();
15839 auto PtrVT = getPointerTy(DAG.getDataLayout());
15841 if (OpFlags == X86II::MO_NO_FLAG &&
15842 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15843 // A direct static reference to a global.
15844 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
// Otherwise keep the offset out of the node (added explicitly below).
// NOTE(review): the `} else {` line is elided in this excerpt.
15847 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15850 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
15852 // With PIC, the address is actually $g + Offset.
15853 if (isGlobalRelativeToPICBase(OpFlags)) {
15854 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15855 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15858 // For globals that require a load from a stub to get the address, emit the
// load through the GOT.
15860 if (isGlobalStubReference(OpFlags))
15861 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15862 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15864 // If there was a non-zero offset that we didn't fold, create an explicit
15865 // addition for it.
// NOTE(review): the guard on this ADD and the final `return Result;` are
// elided in this excerpt — confirm against upstream.
15867 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15868 DAG.getConstant(Offset, dl, PtrVT));
// Convenience overload: unpack the GlobalAddressSDNode and forward to the
// (GV, dl, Offset) form above.
15874 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15875 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15876 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15877 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
// Emit an X86ISD::TLSADDR (or TLSBASEADDR when LocalDynamic) pseudo and
// return a CopyFromReg of ReturnReg (EAX/RAX), which the pseudo defines with
// the TLS address. The pseudo is codegen'ed as a real call, so the function
// is marked as having calls and stack adjustment.
15881 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15882 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15883 unsigned char OperandFlags, bool LocalDynamic = false) {
15884 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15885 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15887 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15888 GA->getValueType(0),
// NOTE(review): remaining getTargetGlobalAddress arguments (offset,
// OperandFlags) are elided in this excerpt.
15892 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
// Glue the pseudo to the incoming flag when one is supplied (32-bit path,
// where EBX was just set up); otherwise emit it unglued.
15896 SDValue Ops[] = { Chain, TGA, *InFlag };
15897 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15899 SDValue Ops[] = { Chain, TGA };
15900 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15903 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15904 MFI.setAdjustsStack(true);
15905 MFI.setHasCalls(true);
15907 SDValue Flag = Chain.getValue(1);
15908 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15911 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
// Copies the GOT base into EBX (required by the 32-bit TLS pseudo's
// convention), glues it to the TLSADDR call, and returns the result in EAX.
15913 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15916 SDLoc dl(GA); // ? function entry point might be better
15917 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15918 DAG.getNode(X86ISD::GlobalBaseReg,
15919 SDLoc(), PtrVT), InFlag);
15920 InFlag = Chain.getValue(1);
15922 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15925 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
// 64-bit needs no explicit GOT-base setup; the pseudo's result is in RAX.
15927 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15929 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15930 X86::RAX, X86II::MO_TLSGD);
// Lower ISD::GlobalTLSAddress using the "local dynamic" model: a single
// TLSBASEADDR call fetches the module's TLS block base (shared between all
// local-dynamic accesses in the function), then the variable's @dtpoff
// offset is added to that base.
15933 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15939 // Get the start address of the TLS block for this module.
15940 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15941 .getInfo<X86MachineFunctionInfo>();
// Counted so the cleanup pass knows this function performs LD TLS accesses.
15942 MFI->incNumLocalDynamicTLSAccesses();
// 64-bit path: no GOT-base setup required.
15946 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15947 X86II::MO_TLSLD, /*LocalDynamic=*/true);
// 32-bit path: EBX must hold the GOT base, as in general-dynamic.
15950 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15951 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15952 InFlag = Chain.getValue(1);
15953 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15954 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15957 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// of the TLS base.
15961 unsigned char OperandFlags = X86II::MO_DTPOFF;
15962 unsigned WrapperKind = X86ISD::Wrapper;
15963 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15964 GA->getValueType(0),
15965 GA->getOffset(), OperandFlags);
15966 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15968 // Add x@dtpoff with the base.
15969 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15972 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
// Loads the thread pointer via a zero-offset load in the segment-relative
// address space (%gs: on 32-bit, %fs: on 64-bit), then adds the variable's
// offset: a link-time constant (@tpoff/@ntpoff) for local-exec, or a value
// loaded from the GOT (@gottpoff/@indntpoff/@gotntpoff) for initial-exec.
15973 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15974 const EVT PtrVT, TLSModel::Model model,
15975 bool is64Bit, bool isPIC) {
15978 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
// Address spaces 256/257 are x86's gs:/fs: segment-relative spaces.
15979 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15980 is64Bit ? 257 : 256));
15982 SDValue ThreadPointer =
15983 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15984 MachinePointerInfo(Ptr));
15986 unsigned char OperandFlags = 0;
15987 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
// initial-exec on x86-64, which loads the offset RIP-relative from the GOT.
15989 unsigned WrapperKind = X86ISD::Wrapper;
15990 if (model == TLSModel::LocalExec) {
15991 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15992 } else if (model == TLSModel::InitialExec) {
// NOTE(review): the is64Bit branch structure here is partially elided in
// this excerpt — GOTTPOFF/WrapperRIP is the 64-bit arm, the
// GOTNTPOFF/INDNTPOFF choice the 32-bit arm.
15994 OperandFlags = X86II::MO_GOTTPOFF;
15995 WrapperKind = X86ISD::WrapperRIP;
15997 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
16000 llvm_unreachable("Unexpected model");
16003 // emit "addl x@ntpoff,%eax" (local exec)
16004 // or "addl x@indntpoff,%eax" (initial exec)
16005 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
16007 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
16008 GA->getOffset(), OperandFlags);
16009 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
// Initial-exec: the actual offset must still be loaded from the GOT; under
// 32-bit PIC the GOT slot address is formed by adding the PIC base first.
16011 if (model == TLSModel::InitialExec) {
16012 if (isPIC && !is64Bit) {
16013 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
16014 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16018 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
16019 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
16022 // The address of the thread local variable is the add of the thread
16023 // pointer with the offset of the variable.
16024 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
// Dispatch TLS-address lowering by target OS: emulated TLS, ELF (per TLS
// model), Darwin (single TLVP model via a TLSCALL pseudo), and Windows
// (implicit TLS through the TEB's ThreadLocalStoragePointer + _tls_index).
16028 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
16030 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
16032 if (DAG.getTarget().useEmulatedTLS())
16033 return LowerToTLSEmulatedModel(GA, DAG);
16035 const GlobalValue *GV = GA->getGlobal();
16036 auto PtrVT = getPointerTy(DAG.getDataLayout());
16037 bool PositionIndependent = isPositionIndependent();
16039 if (Subtarget.isTargetELF()) {
16040 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
// NOTE(review): the `switch (model) {` line and closing brace are elided in
// this excerpt.
16042 case TLSModel::GeneralDynamic:
16043 if (Subtarget.is64Bit())
16044 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
16045 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
16046 case TLSModel::LocalDynamic:
16047 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
16048 Subtarget.is64Bit());
16049 case TLSModel::InitialExec:
16050 case TLSModel::LocalExec:
16051 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
16052 PositionIndependent);
16054 llvm_unreachable("Unknown TLS model.");
16057 if (Subtarget.isTargetDarwin()) {
16058 // Darwin only has one model of TLS. Lower to that.
16059 unsigned char OpFlag = 0;
16060 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
16061 X86ISD::WrapperRIP : X86ISD::Wrapper;
16063 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16064 // global base reg.
16065 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
16067 OpFlag = X86II::MO_TLVP_PIC_BASE;
16069 OpFlag = X86II::MO_TLVP;
16071 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
16072 GA->getValueType(0),
16073 GA->getOffset(), OpFlag);
16074 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
16076 // With PIC32, the address is actually $g + Offset.
16078 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
16079 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16082 // Lowering the machine isd will make sure everything is in the right
// place: TLSCALL is wrapped in a callseq so register allocation treats it
// like a real call.
16084 SDValue Chain = DAG.getEntryNode();
16085 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16086 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16087 SDValue Args[] = { Chain, Offset };
16088 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
16089 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
16090 DAG.getIntPtrConstant(0, DL, true),
16091 Chain.getValue(1), DL);
16093 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
16094 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16095 MFI.setAdjustsStack(true);
16097 // And our return value (tls address) is in the standard call return value
// register (EAX/RAX).
16099 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
16100 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
16103 if (Subtarget.isTargetKnownWindowsMSVC() ||
16104 Subtarget.isTargetWindowsItanium() ||
16105 Subtarget.isTargetWindowsGNU()) {
16106 // Just use the implicit TLS architecture
16107 // Need to generate something similar to:
16108 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
16110 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
16111 // mov rcx, qword [rdx+rcx*8]
16112 // mov eax, .tls$:tlsvar
16113 // [rax+rcx] contains the address
16114 // Windows 64bit: gs:0x58
16115 // Windows 32bit: fs:__tls_array
16118 SDValue Chain = DAG.getEntryNode();
16120 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
16121 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
16122 // use its literal value of 0x2C.
16123 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
16124 ? Type::getInt8PtrTy(*DAG.getContext(),
16126 : Type::getInt32PtrTy(*DAG.getContext(),
16129 SDValue TlsArray = Subtarget.is64Bit()
16130 ? DAG.getIntPtrConstant(0x58, dl)
16131 : (Subtarget.isTargetWindowsGNU()
16132 ? DAG.getIntPtrConstant(0x2C, dl)
16133 : DAG.getExternalSymbol("_tls_array", PtrVT));
16135 SDValue ThreadPointer =
16136 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
// Local-exec: the module's TLS block is the first one, so the thread
// pointer itself is the base; otherwise index the TLS array by _tls_index.
16139 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
16140 res = ThreadPointer;
16142 // Load the _tls_index variable
16143 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
16144 if (Subtarget.is64Bit())
16145 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
16146 MachinePointerInfo(), MVT::i32);
16148 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
// Scale the index by the pointer size to address the slot array.
16150 auto &DL = DAG.getDataLayout();
16152 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
16153 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
16155 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
16158 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
16160 // Get the offset of start of .tls section
16161 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16162 GA->getValueType(0),
16163 GA->getOffset(), X86II::MO_SECREL);
16164 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
16166 // The address of the thread local variable is the add of the thread
16167 // pointer with the offset of the variable.
16168 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
16171 llvm_unreachable("TLS not implemented for this target.");
16174 /// Lower SRA_PARTS and friends, which return two i32 values
16175 /// and take a 2 x i32 value to shift plus a shift amount.
// Strategy: compute the "small shift" result with SHLD/SHRD plus a plain
// shift on the other half, then SELECT on (ShAmt & VTBits) to patch up the
// case where the shift amount is >= the part width.
16176 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
16177 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
16178 MVT VT = Op.getSimpleValueType();
16179 unsigned VTBits = VT.getSizeInBits();
16181 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
16182 SDValue ShOpLo = Op.getOperand(0);
16183 SDValue ShOpHi = Op.getOperand(1);
16184 SDValue ShAmt = Op.getOperand(2);
16185 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
16186 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
// by the DAG combiner when redundant.
16188 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16189 DAG.getConstant(VTBits - 1, dl, MVT::i8));
// Tmp1 = the "overflow" fill value: sign bits for SRA, zero otherwise.
16190 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
16191 DAG.getConstant(VTBits - 1, dl, MVT::i8))
16192 : DAG.getConstant(0, dl, VT);
16194 SDValue Tmp2, Tmp3;
16195 if (Op.getOpcode() == ISD::SHL_PARTS) {
16196 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
16197 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
// (else: SRL/SRA_PARTS — mirror image of the above)
16199 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
16200 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
16203 // If the shift amount is larger or equal than the width of a part we can't
16204 // rely on the results of shld/shrd. Insert a test and select the appropriate
16205 // values for large shift amounts.
16206 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16207 DAG.getConstant(VTBits, dl, MVT::i8))
16208 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
16209 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
16212 if (Op.getOpcode() == ISD::SHL_PARTS) {
16213 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16214 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16216 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16217 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16220 return DAG.getMergeValues({ Lo, Hi }, dl);
16223 // Try to use a packed vector operation to handle i64 on 32-bit targets when
16224 // AVX512DQ is enabled.
// Bails out (presumably returning SDValue()) unless: DQI available, source
// is scalar i64, target is 32-bit, and the result is f32/f64. Otherwise the
// scalar i64 is packed into a vector, converted with the packed vXi64->FP
// instruction, and element 0 extracted.
16225 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
16226 const X86Subtarget &Subtarget) {
16227 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
16228 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
16229 SDValue Src = Op.getOperand(0);
16230 MVT SrcVT = Src.getSimpleValueType();
16231 MVT VT = Op.getSimpleValueType();
16233 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
16234 (VT != MVT::f32 && VT != MVT::f64))
16237 // Pack the i64 into a vector, do the operation and extract.
16239 // Using 256-bit to ensure result is 128-bits for f32 case.
// Without VLX only 512-bit vXi64->FP converts exist, hence 8 elements.
16240 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
16241 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
16242 MVT VecVT = MVT::getVectorVT(VT, NumElts);
16245 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
16246 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
16247 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
16248 DAG.getIntPtrConstant(0, dl));
// Lower ISD::SINT_TO_FP. Handles v2i32->v2f64 via CVTSI2P; scalar i32/i64
// sources that SSE handles natively are returned unchanged; an AVX512DQ
// vector trick is tried for i64 on 32-bit; otherwise the value is spilled
// to a stack slot and converted with x87 FILD (see BuildFILD).
16251 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
16252 SelectionDAG &DAG) const {
16253 SDValue Src = Op.getOperand(0);
16254 MVT SrcVT = Src.getSimpleValueType();
16255 MVT VT = Op.getSimpleValueType();
16258 if (SrcVT.isVector()) {
16259 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Widen v2i32 to v4i32 (upper half undef) so CVTSI2P can consume it.
16260 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16261 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16262 DAG.getUNDEF(SrcVT)));
16267 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16268 "Unknown SINT_TO_FP to lower!");
16270 // These are really Legal; return the operand so the caller accepts it as
// Legal (cvtsi2ss/sd handle these directly).
16272 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
16274 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
// NOTE(review): the `return Op;` bodies of these two ifs are elided in
// this excerpt.
16278 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16281 SDValue ValueToStore = Op.getOperand(0);
16282 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
16283 !Subtarget.is64Bit())
16284 // Bitcasting to f64 here allows us to do a single 64-bit store from
16285 // an SSE register, avoiding the store forwarding penalty that would come
16286 // with two 32-bit stores.
16287 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16289 unsigned Size = SrcVT.getSizeInBits()/8;
16290 MachineFunction &MF = DAG.getMachineFunction();
16291 auto PtrVT = getPointerTy(MF.getDataLayout());
// Stack slot sized and aligned to the source integer; FILD reads it back.
16292 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16293 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16294 SDValue Chain = DAG.getStore(
16295 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16296 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16297 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
// Build an x87 FILD that loads the integer from StackSlot and converts it to
// FP. If the result must live in an SSE register (useSSE), the x87 value is
// immediately stored back to a fresh stack slot (FST) and reloaded, since
// RFP values cannot be live across blocks in SSE mode.
16300 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16302 SelectionDAG &DAG) const {
16306 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
// FILD_FLAG additionally produces glue so the FST below stays coupled.
16308 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16310 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16312 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16314 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16315 MachineMemOperand *MMO;
16317 int SSFI = FI->getIndex();
16318 MMO = DAG.getMachineFunction().getMachineMemOperand(
16319 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16320 MachineMemOperand::MOLoad, ByteSize, ByteSize);
// (else: StackSlot is a load — reuse its memory operand and address.)
16322 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16323 StackSlot = StackSlot.getOperand(1);
16325 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16326 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16328 Tys, Ops, SrcVT, MMO);
// The remainder runs only when useSSE: round-trip through memory to move
// the value from the x87 stack into an SSE register.
16331 Chain = Result.getValue(1);
16332 SDValue InFlag = Result.getValue(2);
16334 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16335 // shouldn't be necessary except that RFP cannot be live across
16336 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16337 MachineFunction &MF = DAG.getMachineFunction();
16338 unsigned SSFISize = Op.getValueSizeInBits()/8;
16339 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16340 auto PtrVT = getPointerTy(MF.getDataLayout());
16341 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16342 Tys = DAG.getVTList(MVT::Other);
16344 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16346 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16347 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16348 MachineMemOperand::MOStore, SSFISize, SSFISize);
16350 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16351 Ops, Op.getValueType(), MMO);
16352 Result = DAG.getLoad(
16353 Op.getValueType(), DL, Chain, StackSlot,
16354 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16360 /// 64-bit unsigned integer to double expansion.
// Classic SSE2 trick: splice the two 32-bit halves of the u64 into the
// mantissas of 2^52 and 2^84 (the 0x433/0x453 exponent words), subtract the
// magic constants to recover (lo, hi*2^32) exactly, and sum the two doubles.
16361 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16362 const X86Subtarget &Subtarget) {
16363 // This algorithm is not obvious. Here it is what we're trying to output:
16366 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16367 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16369 haddpd %xmm0, %xmm0
16371 pshufd $0x4e, %xmm0, %xmm1
16377 LLVMContext *Context = DAG.getContext();
16379 // Build some magic constants.
16380 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16381 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16382 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16383 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16385 SmallVector<Constant*,2> CV1;
16387 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16388 APInt(64, 0x4330000000000000ULL)));
16390 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16391 APInt(64, 0x4530000000000000ULL)));
16392 Constant *C1 = ConstantVector::get(CV1);
16393 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16395 // Load the 64-bit value into an XMM register.
16396 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16399 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16400 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16401 /* Alignment = */ 16);
// Interleave {lo32, hi32} with the exponent words from c0.
16403 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16406 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16407 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16408 /* Alignment = */ 16);
16409 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16410 // TODO: Are there any fast-math-flags to propagate here?
16411 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
// Horizontally add the two lanes; without SSE3 use a shuffle + add instead.
16414 if (Subtarget.hasSSE3()) {
16415 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16416 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16418 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16419 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16420 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16421 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16424 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16425 DAG.getIntPtrConstant(0, dl));
16428 /// 32-bit unsigned integer to float expansion.
// Same magic-constant idea as the i64 case, but only one bias is needed:
// OR the u32 into the mantissa of 2^52 (0x4330...), subtract 2^52 to get the
// exact double, then round to the final type.
16429 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16430 const X86Subtarget &Subtarget) {
16432 // FP constant to bias correct the final result.
16433 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16436 // Load the 32-bit value into an XMM register.
16437 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16440 // Zero out the upper parts of the register.
16441 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16443 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16444 DAG.getBitcast(MVT::v2f64, Load),
16445 DAG.getIntPtrConstant(0, dl));
16447 // Or the load with the bias.
// Done as a v2i64 OR: the integer payload drops into the low mantissa bits
// of the bias constant.
16448 SDValue Or = DAG.getNode(
16449 ISD::OR, dl, MVT::v2i64,
16450 DAG.getBitcast(MVT::v2i64,
16451 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16452 DAG.getBitcast(MVT::v2i64,
16453 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16455 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16456 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16458 // Subtract the bias.
16459 // TODO: Are there any fast-math-flags to propagate here?
16460 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
16462 // Handle final rounding.
16463 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
// Lower v2i32 UINT_TO_FP to v2f64. With AVX512 this is a single CVTUI2P;
// otherwise the value is split into 16-bit halves which each fit exactly in
// the signed range of CVTSI2P, then recombined as fHI * 2^16 + fLO.
16466 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16467 const X86Subtarget &Subtarget,
16469 if (Op.getSimpleValueType() != MVT::v2f64)
16472 SDValue N0 = Op.getOperand(0);
16473 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16475 // Legalize to v4i32 type.
16476 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16477 DAG.getUNDEF(MVT::v2i32));
16479 if (Subtarget.hasAVX512())
16480 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16482 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16483 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
16484 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16485 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16487 // Two to the power of half-word-size.
16488 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16490 // Clear upper part of LO, lower HI.
16491 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16492 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
// Each half is <= 0xFFFF, so the signed conversion is exact and positive.
16494 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16495 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16496 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16498 // Add the two halves.
// Exact in f64: both addends have at most 16 significant bits.
16499 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
// Lower v4i32/v8i32 UINT_TO_FP to v4f32/v8f32 by splitting each lane into a
// low and a high 16-bit half, embedding each half in a float via blend/OR
// with a magic exponent constant, and recombining with two FADDs.
16502 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16503 const X86Subtarget &Subtarget) {
16504 // The algorithm is the following:
16505 // #ifdef __SSE4_1__
16506 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16507 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16508 // (uint4) 0x53000000, 0xaa);
16510 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16511 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16513 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16514 // return (float4) lo + fhi;
16516 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16517 // reassociate the two FADDs, and if we do that, the algorithm fails
16518 // spectacularly (PR24512).
16519 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16520 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16521 // there's also the MachineCombiner reassociations happening on Machine IR.
16522 if (DAG.getTarget().Options.UnsafeFPMath)
16526 SDValue V = Op->getOperand(0);
16527 MVT VecIntVT = V.getSimpleValueType();
16528 bool Is128 = VecIntVT == MVT::v4i32;
16529 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16530 // If we convert to something else than the supported type, e.g., to v4f64,
16532 if (VecFloatVT != Op->getSimpleValueType(0))
16535 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16536 "Unsupported custom type");
16538 // In the #idef/#else code, we have in common:
16539 // - The vector of constants:
16545 // Create the splat vector for 0x4b000000.
// 0x4b000000 is 2^23 as a float: ORing a 16-bit value into its mantissa
// encodes lo exactly. 0x53000000 is 2^39, used the same way for hi.
16546 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16547 // Create the splat vector for 0x53000000.
16548 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16550 // Create the right shift.
16551 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16552 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16555 if (Subtarget.hasSSE41()) {
// With SSE4.1 the mask/OR pairs are replaced by 16-bit-lane blends (mask
// 0xaa selects the odd i16 lanes, i.e. the high halves, from the constant).
16556 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16557 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16558 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16559 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16560 // Low will be bitcasted right away, so do not bother bitcasting back to its
16562 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16563 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16564 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16565 // (uint4) 0x53000000, 0xaa);
16566 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16567 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16568 // High will be bitcasted right away, so do not bother bitcasting back to
16569 // its original type.
16570 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16571 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16573 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16574 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16575 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16576 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16578 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16579 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16582 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
// 0xD3000080 is the IEEE-754 encoding of -(2^39 + 2^23): adding it removes
// both magic biases (hi's 2^39 and, via lo's later FADD, the 2^23) at once.
16583 SDValue VecCstFAdd = DAG.getConstantFP(
16584 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16586 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16587 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16588 // TODO: Are there any fast-math-flags to propagate here?
16590 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16591 // return (float4) lo + fhi;
16592 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16593 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
// Dispatch vector UINT_TO_FP lowering based on the source vector type.
// v2i32 goes to the v2f64 expansion; vXi32 goes to the two-half float
// expansion (which asserts AVX512 is absent, since AVX512 handles these
// conversions natively).
16596 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16597 const X86Subtarget &Subtarget) {
16598 SDValue N0 = Op.getOperand(0);
16599 MVT SrcVT = N0.getSimpleValueType();
16602 switch (SrcVT.SimpleTy) {
16604 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16606 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16609 assert(!Subtarget.hasAVX512());
16610 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
// Lower UINT_TO_FP. Vector sources are dispatched to the vector helpers;
// scalar sources try (in order) AVX512 native conversions, the i64/i32
// bias-trick expansions, and finally a generic x87 FILD-based sequence with
// a sign-dependent fudge-factor correction.
16614 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16615 SelectionDAG &DAG) const {
16616 SDValue N0 = Op.getOperand(0);
16618 auto PtrVT = getPointerTy(DAG.getDataLayout());
16620 if (Op.getSimpleValueType().isVector())
16621 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16623 MVT SrcVT = N0.getSimpleValueType();
16624 MVT DstVT = Op.getSimpleValueType();
16626 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16627 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16628 // Conversions from unsigned i32 to f32/f64 are legal,
16629 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16633 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16636 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16637 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16638 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16639 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16640 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16643 // Make a 64-bit buffer, and use it to build an FILD.
16644 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64)
16645 if (SrcVT == MVT::i32) {
// An i32 zero-extended to 64 bits is always non-negative, so FILD's
// signed interpretation is already correct: store the value in the low
// word and zero in the high word, then load with FILD.
16646 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16647 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16648 StackSlot, MachinePointerInfo());
16649 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16650 OffsetSlot, MachinePointerInfo());
16651 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16655 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16656 SDValue ValueToStore = Op.getOperand(0);
16657 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16658 // Bitcasting to f64 here allows us to do a single 64-bit store from
16659 // an SSE register, avoiding the store forwarding penalty that would come
16660 // with two 32-bit stores.
16661 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16662 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16663 MachinePointerInfo());
16664 // For i64 source, we need to add the appropriate power of 2 if the input
16665 // was negative. This is the same as the optimization in
16666 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
16667 // we must be careful to do the computation in x87 extended precision, not
16668 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16669 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16670 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16671 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16672 MachineMemOperand::MOLoad, 8, 8);
16674 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16675 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16676 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
// 0x5F800000 is the IEEE-754 single encoding of 2^64: the correction to
// add when FILD interpreted a value with the MSB set as negative.
16679 APInt FF(32, 0x5F800000ULL);
16681 // Check whether the sign bit is set.
16682 SDValue SignSet = DAG.getSetCC(
16683 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16684 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16686 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16687 SDValue FudgePtr = DAG.getConstantPool(
16688 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16690 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16691 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16692 SDValue Four = DAG.getIntPtrConstant(4, dl);
16693 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16694 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16696 // Load the value out, extending it from f32 to f80.
16697 // FIXME: Avoid the extend by constructing the right constant pool?
16698 SDValue Fudge = DAG.getExtLoad(
16699 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16700 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16701 /* Alignment = */ 4);
16702 // Extend everything to 80 bits to force it to be done on x87.
16703 // TODO: Are there any fast-math-flags to propagate here?
16704 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16705 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16706 DAG.getIntPtrConstant(0, dl));
16709 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16710 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16711 // just return an <SDValue(), SDValue()> pair.
16712 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16713 // to i16, i32 or i64, and we lower it to a legal sequence.
16714 // If lowered to the final integer result we return a <result, SDValue()> pair.
16715 // Otherwise we lower it to a sequence ending with a FIST, return a
16716 // <FIST, StackSlot> pair, and the caller is responsible for loading
16717 // the final integer result from StackSlot.
16718 std::pair<SDValue,SDValue>
16719 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16720 bool IsSigned, bool IsReplace) const {
16723 EVT DstTy = Op.getValueType();
16724 EVT TheVT = Op.getOperand(0).getValueType();
16725 auto PtrVT = getPointerTy(DAG.getDataLayout());
16727 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16728 // f16 must be promoted before using the lowering in this routine.
16729 // fp128 does not use this lowering.
16730 return std::make_pair(SDValue(), SDValue());
16733 // If using FIST to compute an unsigned i64, we'll need some fixup
16734 // to handle values above the maximum signed i64. A FIST is always
16735 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16736 bool UnsignedFixup = !IsSigned &&
16737 DstTy == MVT::i64 &&
16738 (!Subtarget.is64Bit() ||
16739 !isScalarFPTypeInSSEReg(TheVT));
16741 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16742 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16743 // The low 32 bits of the fist result will have the correct uint32 result.
16744 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16748 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16749 DstTy.getSimpleVT() >= MVT::i16 &&
16750 "Unknown FP_TO_INT to lower!");
16752 // These are really Legal.
16753 if (DstTy == MVT::i32 &&
16754 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16755 return std::make_pair(SDValue(), SDValue());
16756 if (Subtarget.is64Bit() &&
16757 DstTy == MVT::i64 &&
16758 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16759 return std::make_pair(SDValue(), SDValue());
16761 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16763 MachineFunction &MF = DAG.getMachineFunction();
16764 unsigned MemSize = DstTy.getSizeInBits()/8;
16765 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16766 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16769 switch (DstTy.getSimpleVT().SimpleTy) {
16770 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16771 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16772 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16773 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16776 SDValue Chain = DAG.getEntryNode();
16777 SDValue Value = Op.getOperand(0);
16778 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16780 if (UnsignedFixup) {
16782 // Conversion to unsigned i64 is implemented with a select,
16783 // depending on whether the source value fits in the range
16784 // of a signed i64. Let Thresh be the FP equivalent of
16785 // 0x8000000000000000ULL.
16787 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16788 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16789 // Fist-to-mem64 FistSrc
16790 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16791 // to XOR'ing the high 32 bits with Adjust.
16793 // Being a power of 2, Thresh is exactly representable in all FP formats.
16794 // For X87 we'd like to use the smallest FP type for this constant, but
16795 // for DAG type consistency we have to match the FP operand type.
// 0x5f000000 encodes 2^63 as an IEEE-754 single — the signed-i64 overflow
// threshold. It is widened below (exactly) to match the operand type.
16797 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16798 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16799 bool LosesInfo = false;
16800 if (TheVT == MVT::f64)
16801 // The rounding mode is irrelevant as the conversion should be exact.
16802 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16804 else if (TheVT == MVT::f80)
16805 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16806 APFloat::rmNearestTiesToEven, &LosesInfo);
16808 assert(Status == APFloat::opOK && !LosesInfo &&
16809 "FP conversion should have been exact");
16811 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16813 SDValue Cmp = DAG.getSetCC(DL,
16814 getSetCCResultType(DAG.getDataLayout(),
16815 *DAG.getContext(), TheVT),
16816 Value, ThreshVal, ISD::SETLT);
16817 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16818 DAG.getConstant(0, DL, MVT::i32),
16819 DAG.getConstant(0x80000000, DL, MVT::i32));
16820 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
// The identical SETCC is rebuilt here; SelectionDAG CSEs it to one node.
16821 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16822 *DAG.getContext(), TheVT),
16823 Value, ThreshVal, ISD::SETLT);
16824 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16827 // FIXME This causes a redundant load/store if the SSE-class value is already
16828 // in memory, such as if it is on the callstack.
16829 if (isScalarFPTypeInSSEReg(TheVT)) {
// FIST only operates on x87 values: spill the SSE value to the stack and
// reload it through an X86ISD::FLD so the truncation runs on the x87 side.
16830 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16831 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16832 MachinePointerInfo::getFixedStack(MF, SSFI));
16833 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16835 Chain, StackSlot, DAG.getValueType(TheVT)
16838 MachineMemOperand *MMO =
16839 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16840 MachineMemOperand::MOLoad, MemSize, MemSize);
16841 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16842 Chain = Value.getValue(1);
// A fresh slot is needed for the FIST result; the old one holds the input.
16843 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16844 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16847 MachineMemOperand *MMO =
16848 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16849 MachineMemOperand::MOStore, MemSize, MemSize);
16851 if (UnsignedFixup) {
16853 // Insert the FIST, load its result as two i32's,
16854 // and XOR the high i32 with Adjust.
16856 SDValue FistOps[] = { Chain, Value, StackSlot };
16857 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16858 FistOps, DstTy, MMO);
16861 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16862 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16865 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16866 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16868 if (Subtarget.is64Bit()) {
16869 // Join High32 and Low32 into a 64-bit result.
16870 // (High32 << 32) | Low32
16871 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16872 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16873 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16874 DAG.getConstant(32, DL, MVT::i8));
16875 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16876 return std::make_pair(Result, SDValue());
16879 SDValue ResultOps[] = { Low32, High32 };
16881 SDValue pair = IsReplace
16882 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16883 : DAG.getMergeValues(ResultOps, DL);
16884 return std::make_pair(pair, SDValue());
16886 // Build the FP_TO_INT*_IN_MEM
16887 SDValue Ops[] = { Chain, Value, StackSlot };
16888 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16890 return std::make_pair(FIST, StackSlot);
// Lower a vector ZERO_EXTEND / ANY_EXTEND on AVX targets. With AVX2
// (Int256) this is a single VZEXT; on plain AVX the extend is built from
// unpacklo/unpackhi against zero (for ZERO_EXTEND) or undef (for
// ANY_EXTEND), whose halves are then concatenated.
16894 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16895 const X86Subtarget &Subtarget) {
16896 MVT VT = Op->getSimpleValueType(0);
16897 SDValue In = Op->getOperand(0);
16898 MVT InVT = In.getSimpleValueType();
16901 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
// FIX: the original compared VT's element count against itself, which is a
// tautology; the intent (per the message) is to check source vs result.
16902 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16903 "Expected same number of elements");
16904 assert((VT.getVectorElementType() == MVT::i16 ||
16905 VT.getVectorElementType() == MVT::i32 ||
16906 VT.getVectorElementType() == MVT::i64) &&
16907 "Unexpected element type");
16908 assert((InVT.getVectorElementType() == MVT::i8 ||
16909 InVT.getVectorElementType() == MVT::i16 ||
16910 InVT.getVectorElementType() == MVT::i32) &&
16911 "Unexpected element type");
16913 if (Subtarget.hasInt256())
16914 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16916 // Optimize vectors in AVX mode:
16919 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16920 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16921 // Concat upper and lower parts.
16924 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16925 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16926 // Concat upper and lower parts.
16929 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16930 SDValue Undef = DAG.getUNDEF(InVT);
// ZERO_EXTEND interleaves with zeros; ANY_EXTEND can use undef high parts.
16931 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16932 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16933 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16935 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16936 VT.getVectorNumElements()/2);
16938 OpLo = DAG.getBitcast(HVT, OpLo);
16939 OpHi = DAG.getBitcast(HVT, OpHi);
16941 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16944 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
// Splits the mask into two v8i1 halves, extends each to v8i16, concatenates
// to v16i16, and truncates to the requested VT (a no-op when VT is v16i16).
// This avoids needing 512-bit ops to extend a 16-element mask.
16945 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16946 const SDLoc &dl, SelectionDAG &DAG) {
16947 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16948 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16949 DAG.getIntPtrConstant(0, dl));
16950 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16951 DAG.getIntPtrConstant(8, dl));
16952 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16953 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16954 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16955 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
// Lower ZERO_EXTEND from a vXi1 mask type. Non-i8 results use a
// sign-extend + logical-shift trick; i8 results go through a vselect of
// splat-1/splat-0, widening to 512 bits when VLX is unavailable.
16958 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16959 const X86Subtarget &Subtarget,
16960 SelectionDAG &DAG) {
16961 MVT VT = Op->getSimpleValueType(0);
16962 SDValue In = Op->getOperand(0);
16963 MVT InVT = In.getSimpleValueType();
16964 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16966 unsigned NumElts = VT.getVectorNumElements();
16968 // For all vectors, but vXi8 we can just emit a sign_extend a shift. This
16969 // avoids a constant pool load.
16970 if (VT.getVectorElementType() != MVT::i8) {
// sext(i1) yields 0 or all-ones; a logical shift right by (bits-1)
// maps that to 0 or 1, i.e. the zero-extended mask bit.
16971 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
16972 return DAG.getNode(ISD::SRL, DL, VT, Extend,
16973 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
16976 // Extend VT if BWI is not supported.
16978 if (!Subtarget.hasBWI()) {
16979 // If v16i32 is to be avoided, we'll need to split and concatenate.
16980 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16981 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16983 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16986 // Widen to 512-bits if VLX is not supported.
16987 MVT WideVT = ExtVT;
16988 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16989 NumElts *= 512 / ExtVT.getSizeInBits();
16990 InVT = MVT::getVectorVT(MVT::i1, NumElts);
// Place the original mask in the low lanes of the wider i1 vector.
16991 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16992 In, DAG.getIntPtrConstant(0, DL));
16993 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16997 SDValue One = DAG.getConstant(1, DL, WideVT);
16998 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
17000 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
17002 // Truncate if we had to extend above.
17004 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
17005 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
17008 // Extract back to 128/256-bit if we widened.
17010 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
17011 DAG.getIntPtrConstant(0, DL));
17013 return SelectedVal;
// Top-level ZERO_EXTEND lowering: vXi1 masks get the dedicated mask path;
// everything else is handled by the AVX unpack-based extend.
17016 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17017 SelectionDAG &DAG) {
17018 SDValue In = Op.getOperand(0);
17019 MVT SVT = In.getSimpleValueType();
17021 if (SVT.getVectorElementType() == MVT::i1)
17022 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
17024 assert(Subtarget.hasAVX() && "Expected AVX support");
17025 return LowerAVXExtend(Op, DAG, Subtarget);
17028 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
17029 /// It makes use of the fact that vectors with enough leading sign/zero bits
17030 /// prevent the PACKSS/PACKUS from saturating the results.
17031 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
17032 /// within each 128-bit lane.
// Returns an empty SDValue when this strategy does not apply; callers fall
// back to other truncation lowerings.
17033 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
17034 const SDLoc &DL, SelectionDAG &DAG,
17035 const X86Subtarget &Subtarget) {
17036 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
17037 "Unexpected PACK opcode");
17039 // Requires SSE2 but AVX512 has fast vector truncate.
17040 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
17043 EVT SrcVT = In.getValueType();
17045 // No truncation required, we might get here due to recursive calls.
17046 if (SrcVT == DstVT)
17049 // We only support vector truncation to 64bits or greater from a
17050 // 128bits or greater source.
17051 unsigned DstSizeInBits = DstVT.getSizeInBits();
17052 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
17053 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
17056 unsigned NumElems = SrcVT.getVectorNumElements();
17057 if (!isPowerOf2_32(NumElems))
17060 LLVMContext &Ctx = *DAG.getContext();
17061 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
17062 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
// Each PACK stage halves the element width; PackedSVT is the per-stage
// intermediate scalar type.
17064 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
17066 // Pack to the largest type possible:
17067 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
// PACKUSDW (dword variant of PACKUS) requires SSE4.1; PACKSSDW does not.
17068 EVT InVT = MVT::i16, OutVT = MVT::i8;
17069 if (SrcVT.getScalarSizeInBits() > 16 &&
17070 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
17075 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
17076 if (SrcVT.is128BitVector()) {
17077 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
17078 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
17079 In = DAG.getBitcast(InVT, In);
17080 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
17081 Res = extractSubVector(Res, 0, DAG, DL, 64);
17082 return DAG.getBitcast(DstVT, Res);
17085 // Extract lower/upper subvectors.
17086 unsigned NumSubElts = NumElems / 2;
17087 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17088 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17090 unsigned SubSizeInBits = SrcSizeInBits / 2;
17091 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
17092 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
17094 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
17095 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
17096 Lo = DAG.getBitcast(InVT, Lo);
17097 Hi = DAG.getBitcast(InVT, Hi);
17098 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17099 return DAG.getBitcast(DstVT, Res);
17102 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
17103 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
17104 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
17105 Lo = DAG.getBitcast(InVT, Lo);
17106 Hi = DAG.getBitcast(InVT, Hi);
17107 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17109 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
17110 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
17111 Res = DAG.getBitcast(MVT::v4i64, Res);
17112 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
17114 if (DstVT.is256BitVector())
17115 return DAG.getBitcast(DstVT, Res);
17117 // If 512bit -> 128bit truncate another stage.
17118 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17119 Res = DAG.getBitcast(PackedVT, Res);
17120 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17123 // Recursively pack lower/upper subvectors, concat result and pack again.
17124 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
17125 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
17126 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
17127 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
17129 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17130 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
17131 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
// Lower TRUNCATE to a vXi1 mask type. Shifts the LSB of each element into
// the sign position, then compares against zero: with BWI/DQI this isels to
// VPMOVB2M/VPMOVW2M/VPMOVD2M/VPMOVQ2M, otherwise to a test-style SETCC.
17134 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
17135 const X86Subtarget &Subtarget) {
17138 MVT VT = Op.getSimpleValueType();
17139 SDValue In = Op.getOperand(0);
17140 MVT InVT = In.getSimpleValueType();
17142 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
17144 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
17145 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
17146 if (InVT.getScalarSizeInBits() <= 16) {
17147 if (Subtarget.hasBWI()) {
17148 // legal, will go to VPMOVB2M, VPMOVW2M
// If the value is already sign-bit-replicated, no shift is needed.
17149 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17150 // We need to shift to get the lsb into sign position.
17151 // Shift packed bytes not supported natively, bitcast to word
17152 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
17153 In = DAG.getNode(ISD::SHL, DL, ExtVT,
17154 DAG.getBitcast(ExtVT, In),
17155 DAG.getConstant(ShiftInx, DL, ExtVT));
17156 In = DAG.getBitcast(InVT, In);
17158 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17161 // Use TESTD/Q, extended vector to packed dword/qword.
17162 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
17163 "Unexpected vector type.");
17164 unsigned NumElts = InVT.getVectorNumElements();
17165 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
17166 // We need to change to a wider element type that we have support for.
17167 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
17168 // For 16 element vectors we extend to v16i32 unless we are explicitly
17169 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
17170 // we need to split into two 8 element vectors which we can extend to v8i32,
17171 // truncate and concat the results. There's an additional complication if
17172 // the original type is v16i8. In that case we can't split the v16i8 so
17173 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
17174 // to v8i32, truncate that to v8i1 and concat the two halves.
17175 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
17176 if (InVT == MVT::v16i8) {
17177 // First we need to sign extend up to 256-bits so we can split that.
17178 InVT = MVT::v16i16;
17179 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
17181 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
17182 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
17183 // We're split now, just emit two truncates and a concat. The two
17184 // truncates will trigger legalization to come back to this function.
17185 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
17186 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
17187 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17189 // We either have 8 elements or we're allowed to use 512-bit vectors.
17190 // If we have VLX, we want to use the narrowest vector that can get the
17191 // job done so we use vXi32.
17192 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
17193 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
17194 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
17196 ShiftInx = InVT.getScalarSizeInBits() - 1;
17199 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17200 // We need to shift to get the lsb into sign position.
17201 In = DAG.getNode(ISD::SHL, DL, InVT, In,
17202 DAG.getConstant(ShiftInx, DL, InVT));
17204 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
17205 if (Subtarget.hasDQI())
17206 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17208 return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
// Lower a vector ISD::TRUNCATE. Strategies are tried in order:
//  1. vXi1 (mask) results go to the dedicated i1 path.
//  2. AVX512 native truncates (vpmovqb/w/d, vpmovdb/w, vpmovwb).
//  3. PACKUS/PACKSS packing when known-zero / sign bits make it lossless.
//  4. Fixed shuffle sequences for the remaining 256-bit -> 128-bit cases.
// NOTE(review): interior lines are missing from this view (e.g. the
// KnownBits declaration and some returns) — behavior notes below are
// based on the visible code only.
17212 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
17214 MVT VT = Op.getSimpleValueType();
17215 SDValue In = Op.getOperand(0);
17216 MVT InVT = In.getSimpleValueType();
17217 unsigned InNumEltBits = InVT.getScalarSizeInBits();
17219 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
17220 "Invalid TRUNCATE operation");
// Truncation to a mask (vXi1) result has its own dedicated lowering.
17222 if (VT.getVectorElementType() == MVT::i1)
17223 return LowerTruncateVecI1(Op, DAG, Subtarget);
17225 // vpmovqb/w/d, vpmovdb/w, vpmovwb
17226 if (Subtarget.hasAVX512()) {
17227 // word to byte only under BWI
17228 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
17229 // Make sure we're allowed to promote 512-bits.
17230 if (Subtarget.canExtendTo512DQ())
// Sign-extend v16i16 to v16i32 so a 512-bit VPMOV-style truncate applies.
17231 return DAG.getNode(ISD::TRUNCATE, DL, VT,
17232 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
// PACK instructions saturate rather than truncate, so they only implement a
// truncate when the discarded high bits are known zeros (PACKUS) or sign
// copies (PACKSS).
17238 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
17239 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
17241 // Truncate with PACKUS if we are truncating a vector with leading zero bits
17242 // that extend all the way to the packed/truncated value.
17243 // Pre-SSE41 we can only use PACKUSWB.
17245 DAG.computeKnownBits(In, Known);
17246 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
17248 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
17251 // Truncate with PACKSS if we are truncating a vector with sign-bits that
17252 // extend all the way to the packed/truncated value.
17253 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
17255 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
17258 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
17259 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
17260 if (Subtarget.hasInt256()) {
// Keep the even i32 lanes, i.e. the low half of each i64 element.
17261 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
17262 In = DAG.getBitcast(MVT::v8i32, In);
17263 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
17264 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
17265 DAG.getIntPtrConstant(0, DL));
// Without AVX2: split into two 128-bit halves and shuffle the low i32 of
// each i64 element into the result.
17268 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17269 DAG.getIntPtrConstant(0, DL));
17270 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17271 DAG.getIntPtrConstant(2, DL));
17272 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17273 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17274 static const int ShufMask[] = {0, 2, 4, 6};
17275 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
17278 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
17279 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17280 if (Subtarget.hasInt256()) {
17281 In = DAG.getBitcast(MVT::v32i8, In);
17283 // The PSHUFB mask:
// Gather the low two bytes of each i32 into the bottom of each 128-bit lane.
17284 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
17285 -1, -1, -1, -1, -1, -1, -1, -1,
17286 16, 17, 20, 21, 24, 25, 28, 29,
17287 -1, -1, -1, -1, -1, -1, -1, -1 };
17288 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
17289 In = DAG.getBitcast(MVT::v4i64, In);
// Merge the two useful 64-bit chunks (one per 128-bit lane) together.
17291 static const int ShufMask2[] = {0, 2, -1, -1};
17292 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
17293 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17294 DAG.getIntPtrConstant(0, DL));
17295 return DAG.getBitcast(VT, In);
// Without AVX2: PSHUFB each 128-bit half, then combine the low halves with a
// MOVLHPS-style shuffle.
17298 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17299 DAG.getIntPtrConstant(0, DL));
17301 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17302 DAG.getIntPtrConstant(4, DL));
17304 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
17305 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
17307 // The PSHUFB mask:
17308 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
17309 -1, -1, -1, -1, -1, -1, -1, -1};
17311 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
17312 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
17314 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17315 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17317 // The MOVLHPS Mask:
17318 static const int ShufMask2[] = {0, 1, 4, 5};
17319 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
17320 return DAG.getBitcast(MVT::v8i16, res);
17323 // Handle truncation of V256 to V128 using shuffles.
17324 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
17326 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
17328 unsigned NumElems = VT.getVectorNumElements();
17329 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
17331 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
17332 // Prepare truncation shuffle mask
// Select every even element, i.e. the low part of each wide source element.
17333 for (unsigned i = 0; i != NumElems; ++i)
17334 MaskVec[i] = i * 2;
17335 In = DAG.getBitcast(NVT, In);
17336 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
17337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
17338 DAG.getIntPtrConstant(0, DL));
// Lower FP_TO_SINT/FP_TO_UINT. Special vector cases are handled inline via
// AVX512 CVTTP2SI/CVTTP2UI; scalar cases are delegated to FP_TO_INTHelper,
// which may produce a value to be loaded back from a stack slot.
17341 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
17342 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
17343 MVT VT = Op.getSimpleValueType();
17345 if (VT.isVector()) {
17346 SDValue Src = Op.getOperand(0);
// v2f64 -> v2i1: convert to a wider integer vector, truncate to a mask
// type, and extract the low two mask elements.
17349 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
17350 MVT ResVT = MVT::v4i32;
17351 MVT TruncVT = MVT::v4i1;
17352 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
// Unsigned conversions without VLX need the 512-bit instruction forms.
17353 if (!IsSigned && !Subtarget.hasVLX()) {
17354 // Widen to 512-bits.
17355 ResVT = MVT::v8i32;
17356 TruncVT = MVT::v8i1;
17357 Opc = ISD::FP_TO_UINT;
17358 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
17359 DAG.getUNDEF(MVT::v8f64),
17360 Src, DAG.getIntPtrConstant(0, dl));
17362 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
17363 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
17364 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
17365 DAG.getIntPtrConstant(0, dl));
17368 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
// v2f32 -> v2i64: widen the source to v4f32 (upper lanes undef) so the
// CVTTP2SI/CVTTP2UI node has a legal input type.
17369 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
17370 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
17371 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
17372 DAG.getUNDEF(MVT::v2f32)));
// Scalar path: FP_TO_INTHelper returns {FIST chain/value, stack slot}.
17378 assert(!VT.isVector());
17380 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
17381 IsSigned, /*IsReplace=*/ false);
17382 SDValue FIST = Vals.first, StackSlot = Vals.second;
17383 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
17384 if (!FIST.getNode())
17387 if (StackSlot.getNode())
17388 // Load the result.
17389 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
17391 // The node is the result.
// Lower FP_EXTEND of a v2f32 source: widen to v4f32 with undef upper lanes
// and emit the target-specific VFPEXT node, which extends the low elements.
17395 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
17397 MVT VT = Op.getSimpleValueType();
17398 SDValue In = Op.getOperand(0);
17399 MVT SVT = In.getSimpleValueType();
17401 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
17403 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
17404 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
17405 In, DAG.getUNDEF(SVT)));
// Lower FABS/FNEG as a bitwise logic operation against a sign-bit mask
// constant: FABS -> FAND(0x7f..), FNEG -> FXOR(0x80..), and the combined
// FNEG(FABS(x)) -> FOR(0x80..) (FNABS).
17408 /// The only differences between FABS and FNEG are the mask and the logic op.
17409 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
17410 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
17411 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
17412 "Wrong opcode for lowering FABS or FNEG.");
17414 bool IsFABS = (Op.getOpcode() == ISD::FABS);
17416 // If this is a FABS and it has an FNEG user, bail out to fold the combination
17417 // into an FNABS. We'll lower the FABS after that if it is still in use.
17419 for (SDNode *User : Op->uses())
17420 if (User->getOpcode() == ISD::FNEG)
17424 MVT VT = Op.getSimpleValueType();
17426 bool IsF128 = (VT == MVT::f128);
17428 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
17429 // decide if we should generate a 16-byte constant mask when we only need 4 or
17430 // 8 bytes for the scalar case.
// Choose the type the logic op is performed in (LogicVT) and the element
// type used to build the mask constant (EltVT).
17435 if (VT.isVector()) {
17437 EltVT = VT.getVectorElementType();
17438 } else if (IsF128) {
17439 // SSE instructions are used for optimized f128 logical operations.
17440 LogicVT = MVT::f128;
17443 // There are no scalar bitwise logical SSE/AVX instructions, so we
17444 // generate a 16-byte vector constant and logic op even for the scalar case.
17445 // Using a 16-byte mask allows folding the load of the mask with
17446 // the logic op, so it can save (~4 bytes) on code size.
17447 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17451 unsigned EltBits = EltVT.getSizeInBits();
17452 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
17454 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
17455 const fltSemantics &Sem =
17456 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
17457 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17458 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
17460 SDValue Op0 = Op.getOperand(0);
// FNEG of an FABS is folded into a single FNABS (FOR with the sign mask),
// operating directly on the FABS's input.
17461 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
17463 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
17464 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
17466 if (VT.isVector() || IsF128)
17467 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17469 // For the scalar case extend to a 128-bit vector, perform the logic op,
17470 // and extract the scalar result back out.
17471 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
17472 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17473 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
17474 DAG.getIntPtrConstant(0, dl));
// Lower FCOPYSIGN(Mag, Sign): isolate the sign bit of Sign with FAND, clear
// the sign bit of Mag (constant-folding it when Mag is a constant), and OR
// the two together. Scalar f32/f64 are performed in a 128-bit vector type.
17477 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
17478 SDValue Mag = Op.getOperand(0);
17479 SDValue Sign = Op.getOperand(1);
17482 // If the sign operand is smaller, extend it first.
17483 MVT VT = Op.getSimpleValueType();
17484 if (Sign.getSimpleValueType().bitsLT(VT))
17485 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
17487 // And if it is bigger, shrink it first.
17488 if (Sign.getSimpleValueType().bitsGT(VT))
17489 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
17491 // At this point the operands and the result should have the same
17492 // type, and that won't be f80 since that is not custom lowered.
17493 bool IsF128 = (VT == MVT::f128);
17494 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
17495 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
17496 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
17497 "Unexpected type in LowerFCOPYSIGN");
17499 MVT EltVT = VT.getScalarType();
17500 const fltSemantics &Sem =
17501 EltVT == MVT::f64 ? APFloat::IEEEdouble()
17502 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17504 // Perform all scalar logic operations as 16-byte vectors because there are no
17505 // scalar FP logic instructions in SSE.
17506 // TODO: This isn't necessary. If we used scalar types, we might avoid some
17507 // unnecessary splats, but we might miss load folding opportunities. Should
17508 // this decision be based on OptimizeForSize?
17509 bool IsFakeVector = !VT.isVector() && !IsF128;
17512 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17514 // The mask constants are automatically splatted for vector types.
17515 unsigned EltSizeInBits = VT.getScalarSizeInBits();
// SignMask keeps only the sign bit; MagMask keeps everything but the sign.
17516 SDValue SignMask = DAG.getConstantFP(
17517 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17518 SDValue MagMask = DAG.getConstantFP(
17519 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17521 // First, clear all bits but the sign bit from the second operand (sign).
17523 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17524 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17526 // Next, clear the sign bit from the first operand (magnitude).
17527 // TODO: If we had general constant folding for FP logic ops, this check
17528 // wouldn't be necessary.
17530 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
17531 APFloat APF = Op0CN->getValueAPF();
17533 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
17535 // If the magnitude operand wasn't a constant, we need to AND out the sign.
17537 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
17538 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
17541 // OR the magnitude value with the sign bit.
17542 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
// For the fake-vector scalar case, pull the scalar result back out of the
// vector used for the logic ops.
17543 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
17544 DAG.getIntPtrConstant(0, dl));
// Lower ISD::FGETSIGN for f32/f64: place the scalar in a vector, extract
// the sign bits with MOVMSK, and mask to the lowest bit.
17547 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
17548 SDValue N0 = Op.getOperand(0);
17550 MVT VT = Op.getSimpleValueType();
17552 MVT OpVT = N0.getSimpleValueType();
17553 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17554 "Unexpected type for FGETSIGN");
17556 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17557 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17558 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17559 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
// MOVMSK produces an i32; adjust to the requested result type before
// masking off everything but bit 0 (the sign of element 0).
17560 Res = DAG.getZExtOrTrunc(Res, dl, VT);
17561 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
17565 /// Helper for creating a X86ISD::SETCC node.
/// \param Cond   X86 condition code to materialize as an i8 result.
/// \param EFLAGS the flags-producing operand consumed by the SETCC.
17566 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17567 SelectionDAG &DAG) {
17568 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17569 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17572 // Check whether an OR'd tree is PTEST-able.
// Recognizes an OR tree whose leaves are EXTRACT_VECTOR_ELTs that together
// cover every element of one or more same-typed 128/256-bit vectors; such a
// tree tests "is the whole vector (or OR of vectors) zero" and can be
// lowered to a single PTEST + SETCC.
17573 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17574 const X86Subtarget &Subtarget,
17575 SelectionDAG &DAG) {
17576 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
// PTEST requires SSE4.1.
17578 if (!Subtarget.hasSSE41())
17581 if (!Op->hasOneUse())
17584 SDNode *N = Op.getNode();
// VecInMap records, per source vector, a bitmask of which elements have
// been seen; VecIns keeps the vectors in discovery order.
17587 SmallVector<SDValue, 8> Opnds;
17588 DenseMap<SDValue, unsigned> VecInMap;
17589 SmallVector<SDValue, 8> VecIns;
17590 EVT VT = MVT::Other;
17592 // Recognize a special case where a vector is casted into wide integer to
17594 Opnds.push_back(N->getOperand(0));
17595 Opnds.push_back(N->getOperand(1));
17597 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17598 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17599 // BFS traverse all OR'd operands.
17600 if (I->getOpcode() == ISD::OR) {
17601 Opnds.push_back(I->getOperand(0));
17602 Opnds.push_back(I->getOperand(1));
17603 // Re-evaluate the number of nodes to be traversed.
17604 e += 2; // 2 more nodes (LHS and RHS) are pushed.
17608 // Quit if a non-EXTRACT_VECTOR_ELT
17609 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17612 // Quit if without a constant index.
17613 SDValue Idx = I->getOperand(1);
17614 if (!isa<ConstantSDNode>(Idx))
17617 SDValue ExtractedFromVec = I->getOperand(0);
17618 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17619 if (M == VecInMap.end()) {
17620 VT = ExtractedFromVec.getValueType();
17621 // Quit if not 128/256-bit vector.
17622 if (!VT.is128BitVector() && !VT.is256BitVector())
17624 // Quit if not the same type.
17625 if (VecInMap.begin() != VecInMap.end() &&
17626 VT != VecInMap.begin()->first.getValueType())
17628 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17629 VecIns.push_back(ExtractedFromVec);
// Record that this element index of the source vector is covered.
17631 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17634 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17635 "Not extracted from 128-/256-bit vector.");
17637 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17639 for (DenseMap<SDValue, unsigned>::const_iterator
17640 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17641 // Quit if not all elements are used.
17642 if (I->second != FullMask)
17646 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17648 // Cast all vectors into TestVT for PTEST.
17649 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17650 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17652 // If more than one full vector is evaluated, OR them first before PTEST.
17653 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17654 // Each iteration will OR 2 nodes and append the result until there is only
17655 // 1 node left, i.e. the final OR'd value of all vectors.
17656 SDValue LHS = VecIns[Slot];
17657 SDValue RHS = VecIns[Slot + 1];
17658 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
// PTEST V,V sets ZF iff V is all zeros; map SETEQ/SETNE onto E/NE.
17661 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17662 VecIns.back(), VecIns.back());
17663 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17666 /// return true if \c Op has a use that doesn't just read flags.
// Walks all users of Op (looking through single-use TRUNCATEs). Flag-only
// users are BRCOND, SETCC, and SELECT when Op is the condition operand;
// anything else counts as a non-flags use.
17667 static bool hasNonFlagsUse(SDValue Op) {
17668 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17670 SDNode *User = *UI;
17671 unsigned UOpNo = UI.getOperandNo();
17672 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17673 // Look pass truncate.
17674 UOpNo = User->use_begin().getOperandNo();
17675 User = *User->use_begin();
// UOpNo == 0 means Op is the SELECT condition, i.e. a flags-style use.
17678 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17679 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17685 /// Emit nodes that will be selected as "test Op0,Op0", or something
// Tries to reuse the EFLAGS produced by the arithmetic op that computes Op
// (ADD/AND/shifts/etc., possibly through a truncate) instead of emitting a
// separate TEST/CMP-with-zero. Falls back to an X86ISD::CMP against 0.
// NOTE(review): several interior lines (case labels, returns, breaks) are
// missing from this view; comments describe only the visible logic.
17687 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17688 SelectionDAG &DAG) const {
17689 // CF and OF aren't always set the way we want. Determine which
17690 // of these we need.
17691 bool NeedCF = false;
17692 bool NeedOF = false;
// Unsigned comparisons read CF; signed/overflow comparisons read OF.
17695 case X86::COND_A: case X86::COND_AE:
17696 case X86::COND_B: case X86::COND_BE:
17699 case X86::COND_G: case X86::COND_GE:
17700 case X86::COND_L: case X86::COND_LE:
17701 case X86::COND_O: case X86::COND_NO: {
17702 // Check if we really need to set the
17703 // Overflow flag. If NoSignedWrap is present
17704 // that is not actually needed.
17705 switch (Op->getOpcode()) {
17710 if (Op.getNode()->getFlags().hasNoSignedWrap())
17720 // See if we can use the EFLAGS value from the operand instead of
17721 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17722 // we prove that the arithmetic won't overflow, we can't use OF or CF.
17723 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17724 // Emit a CMP with 0, which is the TEST pattern.
17725 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17726 DAG.getConstant(0, dl, Op.getValueType()));
// Opcode/NumOperands describe the flag-setting X86 node we will build; 0
// means "no rewrite found, fall back to CMP with 0".
17728 unsigned Opcode = 0;
17729 unsigned NumOperands = 0;
17731 // Truncate operations may prevent the merge of the SETCC instruction
17732 // and the arithmetic instruction before it. Attempt to truncate the operands
17733 // of the arithmetic instruction and use a reduced bit-width instruction.
17734 bool NeedTruncation = false;
17735 SDValue ArithOp = Op;
17736 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17737 SDValue Arith = Op->getOperand(0);
17738 // Both the trunc and the arithmetic op need to have one user each.
17739 if (Arith->hasOneUse())
17740 switch (Arith.getOpcode()) {
17747 NeedTruncation = true;
17753 // Sometimes flags can be set either with an AND or with an SRL/SHL
17754 // instruction. SRL/SHL variant should be preferred for masks longer than this
17756 const int ShiftToAndMaxMaskWidth = 32;
17757 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17759 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
17760 // which may be the result of a CAST. We use the variable 'Op', which is the
17761 // non-casted variable when we check for possible users.
17762 switch (ArithOp.getOpcode()) {
17764 // We only want to rewrite this as a target-specific node with attached
17765 // flags if there is a reasonable chance of either using that to do custom
17766 // instructions selection that can fold some of the memory operands, or if
17767 // only the flags are used. If there are other uses, leave the node alone
17768 // and emit a test instruction.
17769 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17770 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17771 if (UI->getOpcode() != ISD::CopyToReg &&
17772 UI->getOpcode() != ISD::SETCC &&
17773 UI->getOpcode() != ISD::STORE)
17776 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17777 // An add of one will be selected as an INC.
17779 (!Subtarget.slowIncDec() ||
17780 DAG.getMachineFunction().getFunction().optForSize())) {
17781 Opcode = X86ISD::INC;
17786 // An add of negative one (subtract of one) will be selected as a DEC.
17787 if (C->isAllOnesValue() &&
17788 (!Subtarget.slowIncDec() ||
17789 DAG.getMachineFunction().getFunction().optForSize())) {
17790 Opcode = X86ISD::DEC;
17796 // Otherwise use a regular EFLAGS-setting add.
17797 Opcode = X86ISD::ADD;
17802 // If we have a constant logical shift that's only used in a comparison
17803 // against zero turn it into an equivalent AND. This allows turning it into
17804 // a TEST instruction later.
17805 if (ZeroCheck && Op->hasOneUse() &&
17806 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17807 EVT VT = Op.getValueType();
17808 unsigned BitWidth = VT.getSizeInBits();
17809 unsigned ShAmt = Op->getConstantOperandVal(1);
17810 if (ShAmt >= BitWidth) // Avoid undefined shifts.
// SRL keeps the high bits; SHL keeps the low bits — the AND mask is the
// set of bits the shift does not discard.
17812 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17813 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17814 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17815 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17817 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17818 DAG.getConstant(Mask, dl, VT));
17823 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17824 // because a TEST instruction will be better. However, AND should be
17825 // preferred if the instruction can be combined into ANDN.
17826 if (!hasNonFlagsUse(Op)) {
17827 SDValue Op0 = ArithOp->getOperand(0);
17828 SDValue Op1 = ArithOp->getOperand(1);
17829 EVT VT = ArithOp.getValueType();
17830 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17831 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17832 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17834 // If we cannot select an ANDN instruction, check if we can replace
17835 // AND+IMM64 with a shift before giving up. This is possible for masks
17836 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
17837 if (!isProperAndn) {
17841 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17842 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17846 const APInt &Mask = CN->getAPIntValue();
17847 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17848 break; // Prefer TEST instruction.
17850 unsigned BitWidth = Mask.getBitWidth();
17851 unsigned LeadingOnes = Mask.countLeadingOnes();
17852 unsigned TrailingZeros = Mask.countTrailingZeros();
// Mask of the form 1..10..0: testing (x & Mask) == 0 is equivalent to
// testing (x >> TrailingZeros) == 0.
17854 if (LeadingOnes + TrailingZeros == BitWidth) {
17855 assert(TrailingZeros < VT.getSizeInBits() &&
17856 "Shift amount should be less than the type width");
17857 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17858 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17859 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17863 unsigned LeadingZeros = Mask.countLeadingZeros();
17864 unsigned TrailingOnes = Mask.countTrailingOnes();
// Mask of the form 0..01..1: shift left instead to discard the high bits.
17866 if (LeadingZeros + TrailingOnes == BitWidth) {
17867 assert(LeadingZeros < VT.getSizeInBits() &&
17868 "Shift amount should be less than the type width");
17869 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17870 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17871 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17882 // Similar to ISD::ADD above, check if the uses will preclude useful
17883 // lowering of the target-specific node.
17884 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17885 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17886 if (UI->getOpcode() != ISD::CopyToReg &&
17887 UI->getOpcode() != ISD::SETCC &&
17888 UI->getOpcode() != ISD::STORE)
17891 // Otherwise use a regular EFLAGS-setting instruction.
17892 switch (ArithOp.getOpcode()) {
17893 default: llvm_unreachable("unexpected operator!");
17894 case ISD::SUB: Opcode = X86ISD::SUB; break;
17895 case ISD::XOR: Opcode = X86ISD::XOR; break;
17896 case ISD::AND: Opcode = X86ISD::AND; break;
17897 case ISD::OR: Opcode = X86ISD::OR; break;
// Op already produces flags as result number 1; just return that value.
17909 return SDValue(Op.getNode(), 1);
17915 // If we found that truncation is beneficial, perform the truncation and
17917 if (NeedTruncation) {
17918 EVT VT = Op.getValueType();
17919 SDValue WideVal = Op->getOperand(0);
17920 EVT WideVT = WideVal.getValueType();
17921 unsigned ConvertedOp = 0;
17922 // Use a target machine opcode to prevent further DAGCombine
17923 // optimizations that may separate the arithmetic operations
17924 // from the setcc node.
17925 switch (WideVal.getOpcode()) {
17927 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17928 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17929 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17930 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17931 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
// Rebuild the arithmetic at the narrow width by truncating both operands.
17935 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17936 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17937 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17938 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17939 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17940 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17946 // Emit a CMP with 0, which is the TEST pattern.
17947 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17948 DAG.getConstant(0, dl, Op.getValueType()));
// Build the flag-setting replacement node, RAUW the value result, and
// return the flags (result number 1).
17950 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17951 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17953 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17954 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17955 return SDValue(New.getNode(), 1);
17958 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
// Compares against zero are delegated to EmitTest; otherwise an
// EFLAGS-producing X86ISD::SUB (preferred over CMP for CSE) or CMP is built.
17960 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17961 const SDLoc &dl, SelectionDAG &DAG) const {
17962 if (isNullConstant(Op1))
17963 return EmitTest(Op0, X86CC, dl, DAG);
17965 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17966 "Unexpected comparison operation for MVT::i1 operands");
17968 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17969 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17970 // Only promote the compare up to I32 if it is a 16 bit operation
17971 // with an immediate. 16 bit immediates are to be avoided.
17972 if ((Op0.getValueType() == MVT::i16 &&
17973 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17974 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17975 !Subtarget.isAtom()) {
// Extend matching the comparison's signedness so the result is unchanged.
17976 unsigned ExtendOp =
17977 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17978 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17979 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17981 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17982 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17983 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
// Result 1 of the SUB node is the EFLAGS value.
17984 return SDValue(Sub.getNode(), 1);
17986 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17989 /// Convert a comparison if required by the subtarget.
// On targets without CMOV/FUCOMI, an x87 FP compare leaves its result in
// FPSW rather than EFLAGS; transfer it via FNSTSW + shift + SAHF.
17990 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17991 SelectionDAG &DAG) const {
17992 // If the subtarget does not support the FUCOMI instruction, floating-point
17993 // comparisons have to be converted.
17994 if (Subtarget.hasCMov() ||
17995 Cmp.getOpcode() != X86ISD::CMP ||
17996 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17997 !Cmp.getOperand(1).getValueType().isFloatingPoint())
18000 // The instruction selector will select an FUCOM instruction instead of
18001 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
18002 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
18003 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
18005 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
18006 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
// Shift the condition-code bits of FPSW into AH position for SAHF.
18007 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
18008 DAG.getConstant(8, dl, MVT::i8));
18009 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
18011 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
18012 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
18013 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
18016 /// Check if replacement of SQRT with RSQRT should be disabled.
// Returns true (sqrt is "cheap", so keep it) when an RSQRT node for the
// same input already exists, or the subtarget reports fast FSQRT.
18017 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
18018 EVT VT = Op.getValueType();
18020 // We never want to use both SQRT and RSQRT instructions for the same input.
18021 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
18025 return Subtarget.hasFastVectorFSQRT();
18026 return Subtarget.hasFastScalarFSQRT();
18029 /// The minimum architected relative accuracy is 2^-12. We need one
18030 /// Newton-Raphson step to have a good float result (24 bits of precision).
// Produce an RSQRT-based estimate for sqrt/rsqrt when the type/subtarget
// combination supports it; otherwise return a null SDValue to decline.
18031 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
18032 SelectionDAG &DAG, int Enabled,
18033 int &RefinementSteps,
18034 bool &UseOneConstNR,
18035 bool Reciprocal) const {
18036 EVT VT = Op.getValueType();
18038 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
18039 // It is likely not profitable to do this for f64 because a double-precision
18040 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
18041 // instructions: convert to single, rsqrtss, convert back to double, refine
18042 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
18043 // along with FMA, this could be a throughput win.
18044 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
18045 // after legalize types.
18046 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18047 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
18048 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
18049 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18050 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18051 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18052 RefinementSteps = 1;
18054 UseOneConstNR = false;
18055 // There is no FSQRT for 512-bits, but there is RSQRT14.
18056 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
18057 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18062 /// The minimum architected relative accuracy is 2^-12. We need one
18063 /// Newton-Raphson step to have a good float result (24 bits of precision).
// Produce an RCP-based reciprocal estimate for supported type/subtarget
// combinations; otherwise return a null SDValue to decline.
18064 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
18066 int &RefinementSteps) const {
18067 EVT VT = Op.getValueType();
18069 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
18070 // It is likely not profitable to do this for f64 because a double-precision
18071 // reciprocal estimate with refinement on x86 prior to FMA requires
18072 // 15 instructions: convert to single, rcpss, convert back to double, refine
18073 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
18074 // along with FMA, this could be a throughput win.
18076 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18077 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
18078 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18079 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18080 // Enable estimate codegen with 1 refinement step for vector division.
18081 // Scalar division estimates are disabled because they break too much
18082 // real-world code. These defaults are intended to match GCC behavior.
18083 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
18086 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18087 RefinementSteps = 1;
18089 // There is no FSQRT for 512-bits, but there is RCP14.
18090 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
18091 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18096 /// If we have at least two divisions that use the same divisor, convert to
18097 /// multiplication by a reciprocal. This may need to be adjusted for a given
18098 /// CPU if a division's cost is not at least twice the cost of a multiplication.
18099 /// This is because we still need one division to calculate the reciprocal and
18100 /// then we need two multiplies by that reciprocal as replacements for the
18101 /// original divisions.
// Returns the minimum number of same-divisor divisions required before the
// reciprocal transform is applied (the returned constant is outside this
// view).
18102 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
18106 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
18107 /// according to equal/not-equal condition code \p CC.
18108 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
18109 const SDLoc &dl, SelectionDAG &DAG) {
18110 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
18111 // instruction. Since the shift amount is in-range-or-undefined, we know
18112 // that doing a bittest on the i32 value is ok. We extend to i32 because
18113 // the encoding for the i16 version is larger than the i32 version.
18114 // Also promote i16 to i32 for performance / code size reason.
18115 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
18116 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
18118 // See if we can use the 32-bit instruction instead of the 64-bit one for a
18119 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
18120 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
18121 // known to be zero.
18122 if (Src.getValueType() == MVT::i64 &&
18123 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
18124 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
18126 // If the operand types disagree, extend the shift amount to match. Since
18127 // BT ignores high bits (like shifts) we can use anyextend.
18128 if (Src.getValueType() != BitNo.getValueType())
18129 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
// BT copies the tested bit into CF: bit set -> COND_B, clear -> COND_AE.
18131 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
18132 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
18133 return getSETCC(Cond, BT, dl , DAG);
18136 /// Result of 'and' is compared against zero. Change to a BT node if possible.
// Recognizes (X & (1 << N)) and ((X >> N) & 1) patterns and lowers them to a
// single BT instruction via getBitTestCondition. NOTE(review): several lines
// are elided in this excerpt (e.g. the LHS/RHS declarations around 18146, the
// KnownBits local at 18157, and the failure 'return SDValue();' paths) —
// confirm against the full file.
18137 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
18138 const SDLoc &dl, SelectionDAG &DAG) {
18139 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
18140 SDValue Op0 = And.getOperand(0);
18141 SDValue Op1 = And.getOperand(1);
// Look through truncates on either AND operand; validity of doing so is
// re-checked below via known-bits when the widths differ.
18142 if (Op0.getOpcode() == ISD::TRUNCATE)
18143 Op0 = Op0.getOperand(0);
18144 if (Op1.getOpcode() == ISD::TRUNCATE)
18145 Op1 = Op1.getOperand(0);
// Canonicalize a SHL operand into Op0 so only one pattern check is needed.
18148 if (Op1.getOpcode() == ISD::SHL)
18149 std::swap(Op0, Op1);
18150 if (Op0.getOpcode() == ISD::SHL) {
18151 if (isOneConstant(Op0.getOperand(0))) {
18152 // If we looked past a truncate, check that it's only truncating away
18154 unsigned BitWidth = Op0.getValueSizeInBits();
18155 unsigned AndBitWidth = And.getValueSizeInBits();
18156 if (BitWidth > AndBitWidth) {
18158 DAG.computeKnownBits(Op0, Known);
18159 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
// (X & (1 << N)): the shift amount N becomes the BT bit index.
18163 RHS = Op0.getOperand(1);
18165 } else if (Op1.getOpcode() == ISD::Constant) {
18166 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
18167 uint64_t AndRHSVal = AndRHS->getZExtValue();
18168 SDValue AndLHS = Op0;
// ((X >> N) & 1): again N is the bit index to test.
18170 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
18171 LHS = AndLHS.getOperand(0);
18172 RHS = AndLHS.getOperand(1);
18174 // Use BT if the immediate can't be encoded in a TEST instruction or we
18175 // are optimizing for size and the immedaite won't fit in a byte.
18176 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
18177 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
18178 isPowerOf2_64(AndRHSVal)) {
// Power-of-2 mask: test its single set bit directly (log2 of the mask).
18180 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
18186 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
18191 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
// Returns the CMPPS/CMPPD immediate predicate for SetCCOpcode, swapping
// Op0/Op1 when the condition is only representable with reversed operands.
// Values >= 8 (UEQ/ONE) require AVX or a two-compare expansion at the caller.
// NOTE(review): the 'SSECC'/'Swap' local declarations and the return
// statement are elided in this excerpt.
18193 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
18198 // SSE Condition code mapping:
18207 switch (SetCCOpcode) {
18208 default: llvm_unreachable("Unexpected SETCC condition");
18210 case ISD::SETEQ: SSECC = 0; break;
18212 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
18214 case ISD::SETOLT: SSECC = 1; break;
18216 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
18218 case ISD::SETOLE: SSECC = 2; break;
18219 case ISD::SETUO: SSECC = 3; break;
18221 case ISD::SETNE: SSECC = 4; break;
18222 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
18223 case ISD::SETUGE: SSECC = 5; break;
18224 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
18225 case ISD::SETUGT: SSECC = 6; break;
18226 case ISD::SETO: SSECC = 7; break;
18227 case ISD::SETUEQ: SSECC = 8; break;
18228 case ISD::SETONE: SSECC = 12; break;
// Commute the operands when the predicate only exists in swapped form.
18231 std::swap(Op0, Op1);
18236 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
18237 /// concatenate the result back.
// Used pre-AVX2, where 256-bit integer compares are not available.
// NOTE(review): the 'SDLoc dl' declaration (original line ~18245) is elided
// in this excerpt.
18238 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
18239 MVT VT = Op.getSimpleValueType();
18241 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
18242 "Unsupported value type for operation");
18244 unsigned NumElems = VT.getVectorNumElements();
18246 SDValue CC = Op.getOperand(2);
18248 // Extract the LHS vectors
18249 SDValue LHS = Op.getOperand(0);
18250 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
18251 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
18253 // Extract the RHS vectors
18254 SDValue RHS = Op.getOperand(1);
18255 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
18256 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
18258 // Issue the operation on the smaller types and concatenate the result back
18259 MVT EltVT = VT.getVectorElementType();
18260 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18261 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18262 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
18263 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
// Lower an integer VSETCC producing an AVX-512 vXi1 mask result. Only
// canonicalizes operand/predicate order and re-emits the setcc; isel matches
// the mask-compare instructions. NOTE(review): the 'SDLoc dl' declaration and
// some closing braces are elided in this excerpt.
18266 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
18268 SDValue Op0 = Op.getOperand(0);
18269 SDValue Op1 = Op.getOperand(1);
18270 SDValue CC = Op.getOperand(2);
18271 MVT VT = Op.getSimpleValueType();
18274 assert(VT.getVectorElementType() == MVT::i1 &&
18275 "Cannot set masked compare for this operation");
18277 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
18279 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
18280 // This helps with vptestm matching.
18281 // TODO: Should we just canonicalize the setcc during DAG combine?
18282 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
18283 ISD::isBuildVectorAllZeros(Op0.getNode()))
18284 std::swap(Op0, Op1);
18286 // Prefer SETGT over SETLT.
18287 if (SetCCOpcode == ISD::SETLT) {
18288 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
18289 std::swap(Op0, Op1);
18292 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
18295 /// Try to turn a VSETULT into a VSETULE by modifying its second
18296 /// operand \p Op1. If non-trivial (for example because it's not constant)
18297 /// return an empty value.
// Rewrites x <u C as x <=u C-1 per element. NOTE(review): the null-BV guard
// after 18300, the zero-element underflow check near 18315, and the per-
// element failure return are elided in this excerpt.
18298 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
18299 SelectionDAG &DAG) {
18300 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
18304 MVT VT = Op1.getSimpleValueType();
18305 MVT EVT = VT.getVectorElementType();
18306 unsigned n = VT.getVectorNumElements();
18307 SmallVector<SDValue, 8> ULTOp1;
18309 for (unsigned i = 0; i < n; ++i) {
18310 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
18311 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
18314 // Avoid underflow.
18315 APInt Val = Elt->getAPIntValue();
// Each constant element is decremented to convert ULT into ULE.
18319 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
18322 return DAG.getBuildVector(VT, dl, ULTOp1);
18325 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
18327 /// t = psubus Op0, Op1
18328 /// pcmpeq t, <0..0>
// Lowers selected unsigned i8/i16 vector compares to unsigned-saturating
// subtract + compare-with-zero. Returns empty on unsupported cases.
// NOTE(review): the early 'return SDValue()' paths, the condition switch
// header, and the SETULE/SETUGE case bodies are elided in this excerpt.
18329 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
18330 ISD::CondCode Cond, const SDLoc &dl,
18331 const X86Subtarget &Subtarget,
18332 SelectionDAG &DAG) {
18333 if (!Subtarget.hasSSE2())
// PSUBUS only exists for byte and word elements.
18336 MVT VET = VT.getVectorElementType();
18337 if (VET != MVT::i8 && VET != MVT::i16)
18343 case ISD::SETULT: {
18344 // If the comparison is against a constant we can turn this into a
18345 // setule. With psubus, setule does not require a swap. This is
18346 // beneficial because the constant in the register is no longer
18347 // destructed as the destination so it can be hoisted out of a loop.
18348 // Only do this pre-AVX since vpcmp* is no longer destructive.
18349 if (Subtarget.hasAVX())
18351 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
18357 // Psubus is better than flip-sign because it requires no inversion.
18359 std::swap(Op0, Op1);
// (Op0 -us Op1) == 0 is equivalent to Op0 <=u Op1.
18365 SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
18366 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18367 getZeroVector(VT, Subtarget, DAG, dl));
// Main lowering for vector SETCC on X86: handles FP compares (CMPP/CMPM),
// AVX-512 mask results, XOP PCOM, min/max-based unsigned compares, SUBUS,
// sign-bit flipping for unsigned predicates, and the SSE2 v2i64 emulation
// sequences. NOTE(review): this excerpt elides many lines throughout
// (SDLoc/local declarations, early returns, closing braces) — the structure
// below is a partial view; confirm details against the full file.
18370 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18371 SelectionDAG &DAG) {
18372 SDValue Op0 = Op.getOperand(0);
18373 SDValue Op1 = Op.getOperand(1);
18374 SDValue CC = Op.getOperand(2);
18375 MVT VT = Op.getSimpleValueType();
18376 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18377 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
// --- Floating-point vector compares ---
18382 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18383 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
// With AVX-512 and an i1-element result, use the mask-producing compare.
18387 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18388 assert(VT.getVectorNumElements() <= 16);
18389 Opc = X86ISD::CMPM;
18391 Opc = X86ISD::CMPP;
18392 // The SSE/AVX packed FP comparison nodes are defined with a
18393 // floating-point vector result that matches the operand type. This allows
18394 // them to work with an SSE1 target (integer vector types are not legal).
18395 VT = Op0.getSimpleValueType();
18398 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18399 // emit two comparisons and a logic op to tie them together.
18401 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18402 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18403 // LLVM predicate is SETUEQ or SETONE.
18405 unsigned CombineOpc;
18406 if (Cond == ISD::SETUEQ) {
// UEQ = UNORD | EQ, so combine the two compares with OR.
18409 CombineOpc = X86ISD::FOR;
18411 assert(Cond == ISD::SETONE);
// ONE = ORD & NE, so combine with AND.
18414 CombineOpc = X86ISD::FAND;
18417 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18418 DAG.getConstant(CC0, dl, MVT::i8));
18419 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18420 DAG.getConstant(CC1, dl, MVT::i8));
18421 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18423 // Handle all other FP comparisons here.
18424 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18425 DAG.getConstant(SSECC, dl, MVT::i8));
18428 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18429 // result type of SETCC. The bitcast is expected to be optimized away
18430 // during combining/isel.
18431 if (Opc == X86ISD::CMPP)
18432 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
// --- Integer vector compares ---
18437 MVT VTOp0 = Op0.getSimpleValueType();
18438 assert(VTOp0 == Op1.getSimpleValueType() &&
18439 "Expected operands with same type!");
18440 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18441 "Invalid number of packed elements for source and destination!");
18443 // This is being called by type legalization because v2i32 is marked custom
18444 // for result type legalization for v2f32.
18445 if (VTOp0 == MVT::v2i32)
18448 // The non-AVX512 code below works under the assumption that source and
18449 // destination types are the same.
18450 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18451 "Value types for source and destination must be the same!");
18453 // Break 256-bit integer vector compare into smaller ones.
18454 if (VT.is256BitVector() && !Subtarget.hasInt256())
18455 return Lower256IntVSETCC(Op, DAG);
18457 // The result is boolean, but operands are int/float
18458 if (VT.getVectorElementType() == MVT::i1) {
18459 // In AVX-512 architecture setcc returns mask with i1 elements,
18460 // But there is no compare instruction for i8 and i16 elements in KNL.
18461 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
18462 "Unexpected operand type");
18463 return LowerIntVSETCC_AVX512(Op, DAG);
18466 // Lower using XOP integer comparisons.
18467 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18468 // Translate compare code to XOP PCOM compare mode.
18469 unsigned CmpMode = 0;
18471 default: llvm_unreachable("Unexpected SETCC condition");
18473 case ISD::SETLT: CmpMode = 0x00; break;
18475 case ISD::SETLE: CmpMode = 0x01; break;
18477 case ISD::SETGT: CmpMode = 0x02; break;
18479 case ISD::SETGE: CmpMode = 0x03; break;
18480 case ISD::SETEQ: CmpMode = 0x04; break;
18481 case ISD::SETNE: CmpMode = 0x05; break;
18484 // Are we comparing unsigned or signed integers?
18486 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
18488 return DAG.getNode(Opc, dl, VT, Op0, Op1,
18489 DAG.getConstant(CmpMode, dl, MVT::i8));
18492 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18493 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
18494 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18495 SDValue BC0 = peekThroughBitcasts(Op0);
18496 if (BC0.getOpcode() == ISD::AND) {
18498 SmallVector<APInt, 64> EltBits;
18499 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18500 VT.getScalarSizeInBits(), UndefElts,
18501 EltBits, false, false)) {
18502 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
18504 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
18510 // If this is a SETNE against the signed minimum value, change it to SETGT.
18511 // If this is a SETNE against the signed maximum value, change it to SETLT.
18512 // which will be swapped to SETGT.
18513 // Otherwise we use PCMPEQ+invert.
18515 if (Cond == ISD::SETNE &&
18516 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
18517 if (ConstValue.isMinSignedValue())
18519 else if (ConstValue.isMaxSignedValue())
18523 // If both operands are known non-negative, then an unsigned compare is the
18524 // same as a signed compare and there's no need to flip signbits.
18525 // TODO: We could check for more general simplifications here since we're
18526 // computing known bits.
18527 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18528 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18530 // Special case: Use min/max operations for unsigned compares. We only want
18531 // to do this for unsigned compares if we need to flip signs or if it allows
18532 // use to avoid an invert.
18533 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18534 if (ISD::isUnsignedIntSetCC(Cond) &&
18535 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18536 TLI.isOperationLegal(ISD::UMIN, VT)) {
18537 bool Invert = false;
18540 default: llvm_unreachable("Unexpected condition code");
18541 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
18542 case ISD::SETULE: Opc = ISD::UMIN; break;
18543 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
18544 case ISD::SETUGE: Opc = ISD::UMAX; break;
// x <=u y  <=>  umin(x, y) == x; UGT/ULT are the inverted forms.
18547 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18548 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
18550 // If the logical-not of the result is required, perform that now.
18552 Result = DAG.getNOT(dl, Result, VT);
18557 // Try to use SUBUS and PCMPEQ.
18558 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
18561 // We are handling one of the integer comparisons here. Since SSE only has
18562 // GT and EQ comparisons for integer, swapping operands and multiple
18563 // operations may be required for some comparisons.
18564 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
18566 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18567 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18568 bool Invert = Cond == ISD::SETNE ||
18569 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
18572 std::swap(Op0, Op1);
18574 // Check that the operation in question is available (most are plain SSE2,
18575 // but PCMPGTQ and PCMPEQQ have different requirements).
18576 if (VT == MVT::v2i64) {
18577 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18578 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18580 // First cast everything to the right type.
18581 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18582 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18584 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18585 // bits of the inputs before performing those operations. The lower
18586 // compare is always unsigned.
// Unsigned compare flips all sign bits; signed compare flips only the low
// 32-bit halves so the high halves stay a signed comparison.
18589 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
18591 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
18592 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
18593 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
18595 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18596 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18598 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18599 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18600 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18602 // Create masks for only the low parts/high parts of the 64 bit integers.
18603 static const int MaskHi[] = { 1, 1, 3, 3 };
18604 static const int MaskLo[] = { 0, 0, 2, 2 };
18605 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18606 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18607 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18609 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18610 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
// Invert (for SETNE-style predicates) before casting back to v2i64.
18613 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18615 return DAG.getBitcast(VT, Result);
18618 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18619 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
18620 // pcmpeqd + pshufd + pand.
18621 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18623 // First cast everything to the right type.
18624 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18625 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18628 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18630 // Make sure the lower and upper halves are both all-ones.
18631 static const int Mask[] = { 1, 0, 3, 2 };
18632 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18633 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18636 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18638 return DAG.getBitcast(VT, Result);
18642 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18643 // bits of the inputs before performing those operations.
18645 MVT EltVT = VT.getVectorElementType();
18646 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18648 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18649 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18652 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18654 // If the logical-not of the result is required, perform that now.
18656 Result = DAG.getNOT(dl, Result, VT);
18661 // Try to select this as a KTEST+SETCC if possible.
// Matches (bitcast vXi1 to int) ==/!= 0 or all-ones and emits KORTEST with
// the appropriate flag-based SETCC. Returns empty on no match.
// NOTE(review): the 'return SDValue();' failure paths (e.g. after 18666,
// 18670, and the else branch near 18686) are elided in this excerpt.
18662 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18663 const SDLoc &dl, SelectionDAG &DAG,
18664 const X86Subtarget &Subtarget) {
18665 // Only support equality comparisons.
18666 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18669 // Must be a bitcast from vXi1.
18670 if (Op0.getOpcode() != ISD::BITCAST)
18673 Op0 = Op0.getOperand(0);
18674 MVT VT = Op0.getSimpleValueType();
// Legal mask widths depend on the available AVX-512 feature set.
18675 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18676 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
18677 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18680 X86::CondCode X86CC;
18681 if (isNullConstant(Op1)) {
// KORTEST sets ZF when the OR of the masks is zero.
18682 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18683 } else if (isAllOnesConstant(Op1)) {
18684 // C flag is set for all ones.
18685 X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
18689 // If the input is an OR, we can combine it's operands into the KORTEST.
18692 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18693 LHS = Op0.getOperand(0);
18694 RHS = Op0.getOperand(1);
18697 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18698 return getSETCC(X86CC, KORTEST, dl, DAG);
// Lower a scalar (i8-result) SETCC: try BT, PTEST-style all-zero tests, and
// KTEST patterns first, then fall back to CMP + flag-based SETCC.
// NOTE(review): the 'SDLoc dl' declaration and several 'return' lines are
// elided in this excerpt (numbering jumps) — confirm against the full file.
18701 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18703 MVT VT = Op.getSimpleValueType();
18705 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18707 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18708 SDValue Op0 = Op.getOperand(0);
18709 SDValue Op1 = Op.getOperand(1);
18711 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18713 // Optimize to BT if possible.
18714 // Lower (X & (1 << N)) == 0 to BT(X, N).
18715 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18716 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18717 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18718 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18719 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18723 // Try to use PTEST for a tree ORs equality compared with 0.
18724 // TODO: We could do AND tree with all 1s as well by using the C flag.
18725 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18726 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18727 if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
18731 // Try to lower using KTEST.
18732 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18735 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
18737 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18738 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18740 // If the input is a setcc, then reuse the input setcc or use a new one with
18741 // the inverted condition.
18742 if (Op0.getOpcode() == X86ISD::SETCC) {
18743 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
// Invert exactly when the outer compare negates the inner setcc's value.
18744 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18748 CCode = X86::GetOppositeBranchCondition(CCode);
18749 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
// Generic path: translate the IR condition, emit a CMP, and read its flags.
18753 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18754 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18755 if (X86CC == X86::COND_INVALID)
18758 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18759 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18760 return getSETCC(X86CC, EFLAGS, dl, DAG);
// Lower SETCCCARRY (compare-with-borrow): rematerialize the carry flag by
// adding -1 to the incoming carry value, then emit SBB and read its flags.
// NOTE(review): the 'SDLoc DL' declaration (around 18768) is elided in this
// excerpt.
18763 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18764 SDValue LHS = Op.getOperand(0);
18765 SDValue RHS = Op.getOperand(1);
18766 SDValue Carry = Op.getOperand(2);
18767 SDValue Cond = Op.getOperand(3);
18770 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18771 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18773 // Recreate the carry if needed.
// Adding all-ones sets the hardware carry flag iff Carry was non-zero.
18774 EVT CarryVT = Carry.getValueType();
18775 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18776 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18777 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18779 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32)
18780 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18781 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
18784 /// Return true if opcode is a X86 logical comparison.
// True when Op is a flag-producing X86 node whose EFLAGS result (the listed
// result number) can drive a conditional directly. NOTE(review): the
// 'return true;' lines and the final 'return false;' are elided in this
// excerpt.
18785 static bool isX86LogicalCmp(SDValue Op) {
18786 unsigned Opc = Op.getOpcode();
18787 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18788 Opc == X86ISD::SAHF)
// Arithmetic nodes expose EFLAGS as result #1.
18790 if (Op.getResNo() == 1 &&
18791 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18792 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18793 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18794 Opc == X86ISD::XOR || Opc == X86ISD::AND))
// UMUL exposes EFLAGS as result #2 (lo, hi, flags).
18797 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
// Returns true when V is a TRUNCATE whose source provably has all truncated
// high bits zero, i.e. the truncation loses no information.
// NOTE(review): the 'return false;' for the non-truncate case is elided in
// this excerpt.
18803 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18804 if (V.getOpcode() != ISD::TRUNCATE)
18807 SDValue VOp0 = V.getOperand(0);
18808 unsigned InBits = VOp0.getValueSizeInBits();
18809 unsigned Bits = V.getValueSizeInBits();
18810 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18813 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18814 bool AddTest = true;
18815 SDValue Cond = Op.getOperand(0);
18816 SDValue Op1 = Op.getOperand(1);
18817 SDValue Op2 = Op.getOperand(2);
18819 MVT VT = Op1.getSimpleValueType();
18822 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18823 // are available or VBLENDV if AVX is available.
18824 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18825 if (Cond.getOpcode() == ISD::SETCC &&
18826 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18827 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18828 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18829 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18830 unsigned SSECC = translateX86FSETCC(
18831 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18833 if (Subtarget.hasAVX512()) {
18834 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18835 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18836 assert(!VT.isVector() && "Not a scalar type?");
18837 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18840 if (SSECC < 8 || Subtarget.hasAVX()) {
18841 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18842 DAG.getConstant(SSECC, DL, MVT::i8));
18844 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18845 // of 3 logic instructions for size savings and potentially speed.
18846 // Unfortunately, there is no scalar form of VBLENDV.
18848 // If either operand is a constant, don't try this. We can expect to
18849 // optimize away at least one of the logic instructions later in that
18850 // case, so that sequence would be faster than a variable blend.
18852 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18853 // uses XMM0 as the selection register. That may need just as many
18854 // instructions as the AND/ANDN/OR sequence due to register moves, so
18857 if (Subtarget.hasAVX() &&
18858 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18860 // Convert to vectors, do a VSELECT, and convert back to scalar.
18861 // All of the conversions should be optimized away.
18863 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18864 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18865 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18866 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18868 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18869 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18871 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18873 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18874 VSel, DAG.getIntPtrConstant(0, DL));
18876 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18877 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18878 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18882 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18883 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18884 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18885 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18888 // For v64i1 without 64-bit support we need to split and rejoin.
18889 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18890 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18891 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18892 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18893 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18894 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18895 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18896 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18897 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18900 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18902 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18903 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18904 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18905 Op1Scalar = Op1.getOperand(0);
18907 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18908 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18909 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18910 Op2Scalar = Op2.getOperand(0);
18911 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18912 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18913 Op1Scalar, Op2Scalar);
18914 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18915 return DAG.getBitcast(VT, newSelect);
18916 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18918 DAG.getIntPtrConstant(0, DL));
18922 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18923 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18924 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18925 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18926 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18927 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18928 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18929 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18932 if (Cond.getOpcode() == ISD::SETCC) {
18933 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18935 // If the condition was updated, it's possible that the operands of the
18936 // select were also updated (for example, EmitTest has a RAUW). Refresh
18937 // the local references to the select operands in case they got stale.
18938 Op1 = Op.getOperand(1);
18939 Op2 = Op.getOperand(2);
18943 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18944 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18945 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18946 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18947 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18948 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18949 if (Cond.getOpcode() == X86ISD::SETCC &&
18950 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18951 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18952 SDValue Cmp = Cond.getOperand(1);
18953 unsigned CondCode =
18954 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18956 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18957 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18958 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18959 SDValue CmpOp0 = Cmp.getOperand(0);
18961 // Apply further optimizations for special cases
18962 // (select (x != 0), -1, 0) -> neg & sbb
18963 // (select (x == 0), 0, -1) -> neg & sbb
18964 if (isNullConstant(Y) &&
18965 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18966 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18967 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18968 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18969 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18970 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18971 SDValue(Neg.getNode(), 1));
18975 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18976 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18977 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18979 SDValue Res = // Res = 0 or -1.
18980 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18981 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18983 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18984 Res = DAG.getNOT(DL, Res, Res.getValueType());
18986 if (!isNullConstant(Op2))
18987 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18989 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18990 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18991 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18992 SDValue CmpOp0 = Cmp.getOperand(0);
18993 SDValue Src1, Src2;
18994 // true if Op2 is XOR or OR operator and one of its operands
18996 // ( a , a op b) || ( b , a op b)
18997 auto isOrXorPattern = [&]() {
18998 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18999 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
19001 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
19008 if (isOrXorPattern()) {
19010 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
19011 // we need mask of all zeros or ones with same size of the other
19013 if (CmpSz > VT.getSizeInBits())
19014 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
19015 else if (CmpSz < VT.getSizeInBits())
19016 Neg = DAG.getNode(ISD::AND, DL, VT,
19017 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
19018 DAG.getConstant(1, DL, VT));
19021 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
19022 Neg); // -(and (x, 0x1))
19023 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
19024 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
19029 // Look past (and (setcc_carry (cmp ...)), 1).
19030 if (Cond.getOpcode() == ISD::AND &&
19031 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19032 isOneConstant(Cond.getOperand(1)))
19033 Cond = Cond.getOperand(0);
19035 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19036 // setting operand in place of the X86ISD::SETCC.
19037 unsigned CondOpcode = Cond.getOpcode();
19038 if (CondOpcode == X86ISD::SETCC ||
19039 CondOpcode == X86ISD::SETCC_CARRY) {
19040 CC = Cond.getOperand(0);
19042 SDValue Cmp = Cond.getOperand(1);
19043 unsigned Opc = Cmp.getOpcode();
19044 MVT VT = Op.getSimpleValueType();
19046 bool IllegalFPCMov = false;
19047 if (VT.isFloatingPoint() && !VT.isVector() &&
19048 !isScalarFPTypeInSSEReg(VT)) // FPStack?
19049 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
19051 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
19052 Opc == X86ISD::BT) { // FIXME
19056 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19057 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19058 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19059 Cond.getOperand(0).getValueType() != MVT::i8)) {
19060 SDValue LHS = Cond.getOperand(0);
19061 SDValue RHS = Cond.getOperand(1);
19062 unsigned X86Opcode;
19065 switch (CondOpcode) {
19066 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19067 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19068 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19069 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19070 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19071 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19072 default: llvm_unreachable("unexpected overflowing operator");
19074 if (CondOpcode == ISD::UMULO)
19075 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19078 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19080 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
19082 if (CondOpcode == ISD::UMULO)
19083 Cond = X86Op.getValue(2);
19085 Cond = X86Op.getValue(1);
19087 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
19092 // Look past the truncate if the high bits are known zero.
19093 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19094 Cond = Cond.getOperand(0);
19096 // We know the result of AND is compared against zero. Try to match
19098 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19099 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
19100 CC = NewSetCC.getOperand(0);
19101 Cond = NewSetCC.getOperand(1);
19108 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
19109 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
19112 // a < b ? -1 : 0 -> RES = ~setcc_carry
19113 // a < b ? 0 : -1 -> RES = setcc_carry
19114 // a >= b ? -1 : 0 -> RES = setcc_carry
19115 // a >= b ? 0 : -1 -> RES = ~setcc_carry
19116 if (Cond.getOpcode() == X86ISD::SUB) {
19117 Cond = ConvertCmpIfNecessary(Cond, DAG);
19118 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
19120 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
19121 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
19122 (isNullConstant(Op1) || isNullConstant(Op2))) {
19123 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
19124 DAG.getConstant(X86::COND_B, DL, MVT::i8),
19126 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
19127 return DAG.getNOT(DL, Res, Res.getValueType());
19132 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
19133 // widen the cmov and push the truncate through. This avoids introducing a new
19134 // branch during isel and doesn't add any extensions.
19135 if (Op.getValueType() == MVT::i8 &&
19136 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
19137 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
19138 if (T1.getValueType() == T2.getValueType() &&
19139 // Blacklist CopyFromReg to avoid partial register stalls.
19140 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
19141 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
19143 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19147 // Promote i16 cmovs if it won't prevent folding a load.
19148 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
19149 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
19150 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
19151 SDValue Ops[] = { Op2, Op1, CC, Cond };
19152 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
19153 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19156 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
19157 // condition is true.
19158 SDValue Ops[] = { Op2, Op1, CC, Cond };
19159 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
/// Lower an extend whose source operand is a vXi1 mask vector (also reused
/// for ANY_EXTEND of masks via LowerANY_EXTEND): perform the extension in a
/// wide integer vector type the target supports, then truncate and/or
/// extract back down to the requested result type.
19162 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
19163 const X86Subtarget &Subtarget,
19164 SelectionDAG &DAG) {
19165 MVT VT = Op->getSimpleValueType(0);
19166 SDValue In = Op->getOperand(0);
19167 MVT InVT = In.getSimpleValueType();
// This helper only handles mask (i1-element) sources.
19168 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19169 MVT VTElt = VT.getVectorElementType();
19172 unsigned NumElts = VT.getVectorNumElements();
19174 // Extend VT if the scalar type is v8/v16 and BWI is not supported.
// i8/i16 result elements need AVX512BW; without it, extend to i32 elements
// here and truncate back down at the end of the function.
19176 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
19177 // If v16i32 is to be avoided, we'll need to split and concatenate.
19178 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19179 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
19181 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19184 // Widen to 512-bits if VLX is not supported.
19185 MVT WideVT = ExtVT;
19186 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19187 NumElts *= 512 / ExtVT.getSizeInBits();
19188 InVT = MVT::getVectorVT(MVT::i1, NumElts);
// Place the narrow mask into the low lanes of a wider undef mask; the
// extra lanes are discarded by the EXTRACT_SUBVECTOR at the bottom.
19189 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
19190 In, DAG.getIntPtrConstant(0, dl));
19191 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
19195 MVT WideEltVT = WideVT.getVectorElementType();
// With DQI (32/64-bit elements) or BWI (8/16-bit elements) the mask can be
// sign-extended directly; otherwise materialize it as a vector select
// between all-ones and zero.
19196 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
19197 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
19198 V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
19200 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
19201 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
19202 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
19205 // Truncate if we had to extend i16/i8 above.
19207 WideVT = MVT::getVectorVT(VTElt, NumElts);
19208 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
19211 // Extract back to 128/256-bit if we widened.
19213 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
19214 DAG.getIntPtrConstant(0, dl));
/// Lower ISD::ANY_EXTEND of vector types. A vXi1 mask source shares the
/// sign-extend mask lowering (the extended bits of an any-extend are
/// unspecified, so sign extension is a valid expansion); all other inputs
/// require AVX and are handled by LowerAVXExtend.
19219 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19220 SelectionDAG &DAG) {
19221 SDValue In = Op->getOperand(0);
19222 MVT InVT = In.getSimpleValueType();
19224 if (InVT.getVectorElementType() == MVT::i1)
19225 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19227 assert(Subtarget.hasAVX() && "Expected AVX support");
19228 return LowerAVXExtend(Op, DAG, Subtarget);
19231 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
19232 // For sign extend this needs to handle all vector sizes and SSE4.1 and
19233 // non-SSE4.1 targets. For zero extend this should only handle inputs of
19234 // MVT::v64i8 when BWI is not supported, but AVX512 is.
19235 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
19236 const X86Subtarget &Subtarget,
19237 SelectionDAG &DAG) {
19238 SDValue In = Op->getOperand(0);
19239 MVT VT = Op->getSimpleValueType(0);
19240 MVT InVT = In.getSimpleValueType();
// *_EXTEND_VECTOR_INREG keeps the total vector width: only the low input
// elements are extended into wider result elements.
19241 assert(VT.getSizeInBits() == InVT.getSizeInBits());
19243 MVT SVT = VT.getVectorElementType();
19244 MVT InSVT = InVT.getVectorElementType();
19245 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
// Only i8/i16/i32 -> i16/i32/i64 element extensions are custom lowered
// here, and only for vector widths the subtarget natively supports.
19247 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
19249 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
19251 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
19252 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
19253 !(VT.is512BitVector() && Subtarget.hasAVX512()))
19258 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19259 // For 512-bit vectors, we need 128-bits or 256-bits.
19260 if (VT.getSizeInBits() > 128) {
19261 // Input needs to be at least the same number of elements as output, and
19262 // at least 128-bits.
19263 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
19264 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
19267 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19268 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
19270 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
19271 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
19272 // need to be handled here for 256/512-bit results.
19273 if (Subtarget.hasInt256()) {
19274 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19275 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19276 X86ISD::VSEXT : X86ISD::VZEXT;
19277 return DAG.getNode(ExtOpc, dl, VT, In);
19280 // We should only get here for sign extend.
19281 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19282 "Unexpected opcode!");
19284 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
19288 // As SRAI is only available on i16/i32 types, we expand only up to i32
19289 // and handle i64 separately.
// Repeatedly interleave the low lanes with undef (UNPCKL) and reinterpret
// at twice the element width until the element type reaches i32 (or the
// final result type).
19290 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19291 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19292 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19293 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
19294 Curr = DAG.getBitcast(CurrVT, Curr);
19297 SDValue SignExt = Curr;
19298 if (CurrVT != InVT) {
// The unpack placed the narrow value in the high bits of each element;
// arithmetic shift right moves it down while replicating its sign bit.
19299 unsigned SignExtShift =
19300 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19301 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19302 DAG.getConstant(SignExtShift, dl, MVT::i8));
// i64 elements have no SRAI: build the high halves from the i32 sign bits
// (SRAI by 31) and interleave them with the sign-extended low halves.
19308 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19309 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19310 DAG.getConstant(31, dl, MVT::i8));
19311 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
19312 return DAG.getBitcast(VT, Ext);
19318 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19319 SelectionDAG &DAG) {
19320 MVT VT = Op->getSimpleValueType(0);
19321 SDValue In = Op->getOperand(0);
19322 MVT InVT = In.getSimpleValueType();
19325 if (InVT.getVectorElementType() == MVT::i1)
19326 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19328 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19329 assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
19330 "Expected same number of elements");
19331 assert((VT.getVectorElementType() == MVT::i16 ||
19332 VT.getVectorElementType() == MVT::i32 ||
19333 VT.getVectorElementType() == MVT::i64) &&
19334 "Unexpected element type");
19335 assert((InVT.getVectorElementType() == MVT::i8 ||
19336 InVT.getVectorElementType() == MVT::i16 ||
19337 InVT.getVectorElementType() == MVT::i32) &&
19338 "Unexpected element type");
19340 if (Subtarget.hasInt256())
19341 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19343 // Optimize vectors in AVX mode
19344 // Sign extend v8i16 to v8i32 and
19347 // Divide input vector into two parts
19348 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
19349 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
19350 // concat the vectors to original VT
19352 unsigned NumElems = InVT.getVectorNumElements();
19353 SDValue Undef = DAG.getUNDEF(InVT);
19355 SmallVector<int,8> ShufMask1(NumElems, -1);
19356 for (unsigned i = 0; i != NumElems/2; ++i)
19359 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19361 SmallVector<int,8> ShufMask2(NumElems, -1);
19362 for (unsigned i = 0; i != NumElems/2; ++i)
19363 ShufMask2[i] = i + NumElems/2;
19365 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19367 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19368 VT.getVectorNumElements() / 2);
19370 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19371 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
19373 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
/// Lower a store of a v2i1/v4i1/v8i1 mask value on AVX512F targets that lack
/// AVX512DQ: widen the mask to v8i1, bitcast it to a scalar i8, and emit a
/// plain i8 store with the original store's memory operands.
19376 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19377 SelectionDAG &DAG) {
19378 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
19380 SDValue StoredVal = St->getValue();
19382 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
19383 assert(StoredVal.getValueType().isVector() &&
19384 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
19385 StoredVal.getValueType().getVectorNumElements() <= 8 &&
19387 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19388 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19389 "Expected AVX512F without AVX512DQI");
// Widen the mask to 8 lanes (upper lanes undef) so it fills one byte.
19391 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19392 DAG.getUNDEF(MVT::v8i1), StoredVal,
19393 DAG.getIntPtrConstant(0, dl))
19394 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19396 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19397 St->getPointerInfo(), St->getAlignment(),
19398 St->getMemOperand()->getFlags());
19401 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19402 // may emit an illegal shuffle but the expansion is still better than scalar
19403 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
19404 // we'll emit a shuffle and an arithmetic shift.
19405 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19406 // TODO: It is possible to support ZExt by zeroing the undef values during
19407 // the shuffle phase or after the shuffle.
19408 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19409 SelectionDAG &DAG) {
19410 MVT RegVT = Op.getSimpleValueType();
19411 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19412 assert(RegVT.isInteger() &&
19413 "We only custom lower integer vector sext loads.");
19415 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
19417 EVT MemVT = Ld->getMemoryVT();
19419 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
// Mirror of LowerStore's mask path: load one i8, bitcast to v8i1, and
// extract the low elements as the requested mask type.
19420 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19421 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19422 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19423 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19424 "Expected AVX512F without AVX512DQI");
19426 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19427 Ld->getPointerInfo(), Ld->getAlignment(),
19428 Ld->getMemOperand()->getFlags());
19430 // Replace chain users with the new chain.
19431 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19432 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19434 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19435 DAG.getBitcast(MVT::v8i1, NewLd),
19436 DAG.getIntPtrConstant(0, dl));
19437 return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
19440 // Nothing useful we can do without SSE2 shuffles.
19441 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19443 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19444 unsigned RegSz = RegVT.getSizeInBits();
19446 ISD::LoadExtType Ext = Ld->getExtensionType();
19448 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19449 && "Only anyext and sext are currently implemented.");
19450 assert(MemVT != RegVT && "Cannot extend to the same type");
19451 assert(MemVT.isVector() && "Must load a vector from memory");
19453 unsigned NumElems = RegVT.getVectorNumElements();
19454 unsigned MemSz = MemVT.getSizeInBits();
19455 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19457 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19458 // The only way in which we have a legal 256-bit vector result but not the
19459 // integer 256-bit operations needed to directly lower a sextload is if we
19460 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19461 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19462 // correctly legalized. We do this late to allow the canonical form of
19463 // sextload to persist throughout the rest of the DAG combiner -- it wants
19464 // to fold together any extensions it can, and so will fuse a sign_extend
19465 // of an sextload into a sextload targeting a wider value.
19467 if (MemSz == 128) {
19468 // Just switch this to a normal load.
19469 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19470 "it must be a legal 128-bit vector "
19472 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19473 Ld->getPointerInfo(), Ld->getAlignment(),
19474 Ld->getMemOperand()->getFlags());
19476 assert(MemSz < 128 &&
19477 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19478 // Do an sext load to a 128-bit vector type. We want to use the same
19479 // number of elements, but elements half as wide. This will end up being
19480 // recursively lowered by this routine, but will succeed as we definitely
19481 // have all the necessary features if we're using AVX1.
19483 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19484 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
19486 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19487 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
19488 Ld->getMemOperand()->getFlags());
19491 // Replace chain users with the new chain.
19492 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19493 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19495 // Finally, do a normal sign-extend to the desired register.
19496 return DAG.getSExtOrTrunc(Load, dl, RegVT);
19499 // All sizes must be a power of two.
19500 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19501 "Non-power-of-two elements are not custom lowered!");
19503 // Attempt to load the original value using scalar loads.
19504 // Find the largest scalar type that divides the total loaded size.
19505 MVT SclrLoadTy = MVT::i8;
19506 for (MVT Tp : MVT::integer_valuetypes()) {
19507 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19512 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
19513 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19515 SclrLoadTy = MVT::f64;
19517 // Calculate the number of scalar loads that we need to perform
19518 // in order to load our vector from memory.
19519 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19521 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19522 "Can only lower sext loads with a single scalar load!");
// Width in bits of the vector we rebuild from the scalar loads (note the
// historical "Zize" spelling of the variable name).
19524 unsigned loadRegZize = RegSz;
19525 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19528 // If we don't have BWI we won't be able to create the shuffle needed for
19530 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19531 MemVT == MVT::v8i8)
19534 // Represent our vector as a sequence of elements which are the
19535 // largest scalar that we can load.
19536 EVT LoadUnitVecVT = EVT::getVectorVT(
19537 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19539 // Represent the data using the same element type that is stored in
19540 // memory. In practice, we ''widen'' MemVT.
19542 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19543 loadRegZize / MemVT.getScalarSizeInBits());
19545 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19546 "Invalid vector type");
19548 // We can't shuffle using an illegal type.
19549 assert(TLI.isTypeLegal(WideVecVT) &&
19550 "We only lower types that form legal widened vector types");
19552 SmallVector<SDValue, 8> Chains;
19553 SDValue Ptr = Ld->getBasePtr();
19554 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19555 TLI.getPointerTy(DAG.getDataLayout()));
19556 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
// Emit NumLoads consecutive scalar loads and pack them into one vector.
19558 for (unsigned i = 0; i < NumLoads; ++i) {
19559 // Perform a single load.
19560 SDValue ScalarLoad =
19561 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19562 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19563 Chains.push_back(ScalarLoad.getValue(1));
19564 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19565 // another round of DAGCombining.
19567 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19569 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19570 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19572 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
// Merge the individual load chains into one token factor.
19575 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19577 // Bitcast the loaded value to a vector of the original element type, in
19578 // the size of the target vector type.
19579 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19580 unsigned SizeRatio = RegSz / MemSz;
19582 if (Ext == ISD::SEXTLOAD) {
19583 // If we have SSE4.1, we can directly emit a VSEXT node.
19584 if (Subtarget.hasSSE41()) {
19585 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19586 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19590 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
19592 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19593 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19595 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19596 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19600 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19601 MemVT == MVT::v8i8) {
19602 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19603 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19607 // Redistribute the loaded elements into the different locations.
19608 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19609 for (unsigned i = 0; i != NumElems; ++i)
19610 ShuffleVec[i * SizeRatio] = i;
19612 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19613 DAG.getUNDEF(WideVecVT), ShuffleVec);
19615 // Bitcast to the requested type.
19616 Shuff = DAG.getBitcast(RegVT, Shuff);
19617 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19621 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19622 /// each of which has no other use apart from the AND / OR.
19623 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
// Note: Opc is an out-parameter; it is set to Op's opcode unconditionally,
// even when the function returns false.
19624 Opc = Op.getOpcode();
19625 if (Opc != ISD::OR && Opc != ISD::AND)
19627 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19628 Op.getOperand(0).hasOneUse() &&
19629 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19630 Op.getOperand(1).hasOneUse());
19633 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
19634 /// SETCC node has a single use.
// Matches the canonical "invert a condition" pattern: (xor (setcc ...), 1).
19635 static bool isXor1OfSetCC(SDValue Op) {
19636 if (Op.getOpcode() != ISD::XOR)
19638 if (isOneConstant(Op.getOperand(1)))
19639 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19640 Op.getOperand(0).hasOneUse();
19644 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19645 bool addTest = true;
19646 SDValue Chain = Op.getOperand(0);
19647 SDValue Cond = Op.getOperand(1);
19648 SDValue Dest = Op.getOperand(2);
19651 bool Inverted = false;
19653 if (Cond.getOpcode() == ISD::SETCC) {
19654 // Check for setcc([su]{add,sub,mul}o == 0).
19655 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19656 isNullConstant(Cond.getOperand(1)) &&
19657 Cond.getOperand(0).getResNo() == 1 &&
19658 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19659 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19660 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19661 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19662 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19663 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19665 Cond = Cond.getOperand(0);
19667 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19672 // FIXME: LowerXALUO doesn't handle these!!
19673 else if (Cond.getOpcode() == X86ISD::ADD ||
19674 Cond.getOpcode() == X86ISD::SUB ||
19675 Cond.getOpcode() == X86ISD::SMUL ||
19676 Cond.getOpcode() == X86ISD::UMUL)
19677 Cond = LowerXALUO(Cond, DAG);
19680 // Look pass (and (setcc_carry (cmp ...)), 1).
19681 if (Cond.getOpcode() == ISD::AND &&
19682 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19683 isOneConstant(Cond.getOperand(1)))
19684 Cond = Cond.getOperand(0);
19686 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19687 // setting operand in place of the X86ISD::SETCC.
19688 unsigned CondOpcode = Cond.getOpcode();
19689 if (CondOpcode == X86ISD::SETCC ||
19690 CondOpcode == X86ISD::SETCC_CARRY) {
19691 CC = Cond.getOperand(0);
19693 SDValue Cmp = Cond.getOperand(1);
19694 unsigned Opc = Cmp.getOpcode();
19695 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19696 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19700 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19704 // These can only come from an arithmetic instruction with overflow,
19705 // e.g. SADDO, UADDO.
19706 Cond = Cond.getOperand(1);
19712 CondOpcode = Cond.getOpcode();
19713 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19714 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19715 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19716 Cond.getOperand(0).getValueType() != MVT::i8)) {
19717 SDValue LHS = Cond.getOperand(0);
19718 SDValue RHS = Cond.getOperand(1);
19719 unsigned X86Opcode;
19722 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19723 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
19725 switch (CondOpcode) {
19726 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19728 if (isOneConstant(RHS)) {
19729 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19732 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19733 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19735 if (isOneConstant(RHS)) {
19736 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19739 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19740 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19741 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19742 default: llvm_unreachable("unexpected overflowing operator");
19745 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19746 if (CondOpcode == ISD::UMULO)
19747 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19750 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19752 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19754 if (CondOpcode == ISD::UMULO)
19755 Cond = X86Op.getValue(2);
19757 Cond = X86Op.getValue(1);
19759 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19763 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19764 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19765 if (CondOpc == ISD::OR) {
19766 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19767 // two branches instead of an explicit OR instruction with a
19769 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19770 isX86LogicalCmp(Cmp)) {
19771 CC = Cond.getOperand(0).getOperand(0);
19772 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19773 Chain, Dest, CC, Cmp);
19774 CC = Cond.getOperand(1).getOperand(0);
19778 } else { // ISD::AND
19779 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19780 // two branches instead of an explicit AND instruction with a
19781 // separate test. However, we only do this if this block doesn't
19782 // have a fall-through edge, because this requires an explicit
19783 // jmp when the condition is false.
19784 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19785 isX86LogicalCmp(Cmp) &&
19786 Op.getNode()->hasOneUse()) {
19787 X86::CondCode CCode =
19788 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19789 CCode = X86::GetOppositeBranchCondition(CCode);
19790 CC = DAG.getConstant(CCode, dl, MVT::i8);
19791 SDNode *User = *Op.getNode()->use_begin();
19792 // Look for an unconditional branch following this conditional branch.
19793 // We need this because we need to reverse the successors in order
19794 // to implement FCMP_OEQ.
19795 if (User->getOpcode() == ISD::BR) {
19796 SDValue FalseBB = User->getOperand(1);
19798 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19799 assert(NewBR == User);
19803 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19804 Chain, Dest, CC, Cmp);
19805 X86::CondCode CCode =
19806 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19807 CCode = X86::GetOppositeBranchCondition(CCode);
19808 CC = DAG.getConstant(CCode, dl, MVT::i8);
19814 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19815 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
19816 // It should be transformed during dag combiner except when the condition
19817 // is set by a arithmetics with overflow node.
19818 X86::CondCode CCode =
19819 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19820 CCode = X86::GetOppositeBranchCondition(CCode);
19821 CC = DAG.getConstant(CCode, dl, MVT::i8);
19822 Cond = Cond.getOperand(0).getOperand(1);
19824 } else if (Cond.getOpcode() == ISD::SETCC &&
19825 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19826 // For FCMP_OEQ, we can emit
19827 // two branches instead of an explicit AND instruction with a
19828 // separate test. However, we only do this if this block doesn't
19829 // have a fall-through edge, because this requires an explicit
19830 // jmp when the condition is false.
19831 if (Op.getNode()->hasOneUse()) {
19832 SDNode *User = *Op.getNode()->use_begin();
19833 // Look for an unconditional branch following this conditional branch.
19834 // We need this because we need to reverse the successors in order
19835 // to implement FCMP_OEQ.
19836 if (User->getOpcode() == ISD::BR) {
19837 SDValue FalseBB = User->getOperand(1);
19839 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19840 assert(NewBR == User);
19844 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19845 Cond.getOperand(0), Cond.getOperand(1));
19846 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19847 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19848 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19849 Chain, Dest, CC, Cmp);
19850 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19855 } else if (Cond.getOpcode() == ISD::SETCC &&
19856 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19857 // For FCMP_UNE, we can emit
19858 // two branches instead of an explicit AND instruction with a
19859 // separate test. However, we only do this if this block doesn't
19860 // have a fall-through edge, because this requires an explicit
19861 // jmp when the condition is false.
19862 if (Op.getNode()->hasOneUse()) {
19863 SDNode *User = *Op.getNode()->use_begin();
19864 // Look for an unconditional branch following this conditional branch.
19865 // We need this because we need to reverse the successors in order
19866 // to implement FCMP_UNE.
19867 if (User->getOpcode() == ISD::BR) {
19868 SDValue FalseBB = User->getOperand(1);
19870 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19871 assert(NewBR == User);
19874 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19875 Cond.getOperand(0), Cond.getOperand(1));
19876 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19877 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19878 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19879 Chain, Dest, CC, Cmp);
19880 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19890 // Look pass the truncate if the high bits are known zero.
19891 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19892 Cond = Cond.getOperand(0);
19894 // We know the result of AND is compared against zero. Try to match
19896 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19897 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19898 CC = NewSetCC.getOperand(0);
19899 Cond = NewSetCC.getOperand(1);
19906 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19907 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19908 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19910 Cond = ConvertCmpIfNecessary(Cond, DAG);
19911 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19912 Chain, Dest, CC, Cond);
19915 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19916 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19917 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19918 // that the guard pages used by the OS virtual memory manager are allocated in
19919 // correct sequence.
19921 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19922 SelectionDAG &DAG) const {
// Lowers ISD::DYNAMIC_STACKALLOC using one of three strategies: a plain
// stack-pointer adjustment, segmented-stack SEG_ALLOCA, or Windows-style
// WIN_ALLOCA stack probing, chosen by the flags computed below.
19923 MachineFunction &MF = DAG.getMachineFunction();
19924 bool SplitStack = MF.shouldSplitStack();
19925 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19926 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19927 SplitStack || EmitStackProbe;
// Operands: 0 = chain, 1 = allocation size, 2 = requested alignment (constant).
19931 SDNode *Node = Op.getNode();
19932 SDValue Chain = Op.getOperand(0);
19933 SDValue Size = Op.getOperand(1);
19934 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19935 EVT VT = Node->getValueType(0);
19937 // Chain the dynamic stack allocation so that it doesn't modify the stack
19938 // pointer when other instructions are using the stack.
19939 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19941 bool Is64Bit = Subtarget.is64Bit();
19942 MVT SPTy = getPointerTy(DAG.getDataLayout());
// Inline path: Result = SP - Size, re-aligned downward when the requested
// alignment exceeds the target's stack alignment; the new value is copied
// back into the stack-pointer register.
19946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19947 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19948 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19949 " not tell us which reg is the stack pointer!");
19951 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19952 Chain = SP.getValue(1);
19953 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19954 unsigned StackAlign = TFI.getStackAlignment();
19955 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19956 if (Align > StackAlign)
19957 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19958 DAG.getConstant(-(uint64_t)Align, dl, VT));
19959 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19960 } else if (SplitStack) {
19961 MachineRegisterInfo &MRI = MF.getRegInfo();
19964 // The 64 bit implementation of segmented stacks needs to clobber both r10
19965 // r11. This makes it impossible to use it along with nested parameters.
19966 const Function &F = MF.getFunction();
19967 for (const auto &A : F.args()) {
19968 if (A.hasNestAttr())
19969 report_fatal_error("Cannot use segmented stacks with functions that "
19970 "have nested arguments.");
// Segmented stacks: move Size into a fresh virtual register and let
// SEG_ALLOCA produce the allocated address.
19974 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19975 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19976 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19977 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19978 DAG.getRegister(Vreg, SPTy));
// Windows path: WIN_ALLOCA performs the page-by-page stack probe; the
// adjusted stack pointer is then read back and re-aligned if required.
19980 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19981 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19982 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19984 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19985 unsigned SPReg = RegInfo->getStackRegister();
19986 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19987 Chain = SP.getValue(1);
19990 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19991 DAG.getConstant(-(uint64_t)Align, dl, VT));
19992 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
// Close the call sequence opened above and return {address, chain}.
19998 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19999 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
20001 SDValue Ops[2] = {Result, Chain};
20002 return DAG.getMergeValues(Ops, dl);
20005 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
// Lowers ISD::VASTART. Operands: 0 = chain, 1 = pointer to the va_list
// object, 2 = SrcValue used for alias information on the stores.
20006 MachineFunction &MF = DAG.getMachineFunction();
20007 auto PtrVT = getPointerTy(MF.getDataLayout());
20008 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20010 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
// 32-bit and Win64 use a simple char* va_list: store the frame address of
// the vararg area and we are done.
20013 if (!Subtarget.is64Bit() ||
20014 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
20015 // vastart just stores the address of the VarArgsFrameIndex slot into the
20016 // memory location argument.
20017 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20018 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
20019 MachinePointerInfo(SV));
// x86-64 SysV va_list fields, initialized in order below:
20023 // gp_offset (0 - 6 * 8)
20024 // fp_offset (48 - 48 + 8 * 16)
20025 // overflow_arg_area (point to parameters coming in memory).
20027 SmallVector<SDValue, 8> MemOps;
20028 SDValue FIN = Op.getOperand(1);
// Store gp_offset (field 0 of the va_list).
20030 SDValue Store = DAG.getStore(
20031 Op.getOperand(0), DL,
20032 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
20033 MachinePointerInfo(SV));
20034 MemOps.push_back(Store);
// Store fp_offset (field 1, at byte offset 4).
20037 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
20038 Store = DAG.getStore(
20039 Op.getOperand(0), DL,
20040 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
20041 MachinePointerInfo(SV, 4));
20042 MemOps.push_back(Store);
20044 // Store ptr to overflow_arg_area
20045 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
20046 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20048 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
20049 MemOps.push_back(Store);
20051 // Store ptr to reg_save_area.
20052 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
20053 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
20054 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
20055 Store = DAG.getStore(
20056 Op.getOperand(0), DL, RSFIN, FIN,
20057 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
20058 MemOps.push_back(Store);
// Glue all of the field-initializing stores together.
20059 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
20062 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Lowers ISD::VAARG for the 64-bit SysV ABI through the X86ISD::VAARG_64
// pseudo, which yields the address of the next argument; the value is then
// loaded from that address. Win64 instead uses the generic char* expansion.
20063 assert(Subtarget.is64Bit() &&
20064 "LowerVAARG only handles 64-bit va_arg!");
20065 assert(Op.getNumOperands() == 4);
20067 MachineFunction &MF = DAG.getMachineFunction();
20068 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
20069 // The Win64 ABI uses char* instead of a structure.
20070 return DAG.expandVAArg(Op.getNode());
// Operands: 0 = chain, 1 = va_list pointer, 2 = SrcValue, 3 = alignment.
20072 SDValue Chain = Op.getOperand(0);
20073 SDValue SrcPtr = Op.getOperand(1);
20074 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
20075 unsigned Align = Op.getConstantOperandVal(3);
20078 EVT ArgVT = Op.getNode()->getValueType(0);
20079 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20080 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
20083 // Decide which area this value should be read from.
20084 // TODO: Implement the AMD64 ABI in its entirety. This simple
20085 // selection mechanism works only for the basic types.
// ArgMode selects the va_list area: 1 = gp_offset (GPRs), 2 = fp_offset (XMM).
20086 if (ArgVT == MVT::f80) {
20087 llvm_unreachable("va_arg for f80 not yet implemented");
20088 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
20089 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
20090 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
20091 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
20093 llvm_unreachable("Unhandled argument type in LowerVAARG");
20096 if (ArgMode == 2) {
20097 // Sanity Check: Make sure using fp_offset makes sense.
20098 assert(!Subtarget.useSoftFloat() &&
20099 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
20100 Subtarget.hasSSE1());
20103 // Insert VAARG_64 node into the DAG
20104 // VAARG_64 returns two values: Variable Argument Address, Chain
20105 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
20106 DAG.getConstant(ArgMode, dl, MVT::i8),
20107 DAG.getConstant(Align, dl, MVT::i32)};
20108 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
// VAARG_64 both reads and writes the va_list, hence MOLoad | MOStore.
20109 SDValue VAARG = DAG.getMemIntrinsicNode(
20110 X86ISD::VAARG_64, dl,
20111 VTs, InstOps, MVT::i64,
20112 MachinePointerInfo(SV),
20114 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
20115 Chain = VAARG.getValue(1);
20117 // Load the next argument and return it
20118 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
20121 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
20122 SelectionDAG &DAG) {
20123 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
20124 // where a va_list is still an i8*.
20125 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
20126 if (Subtarget.isCallingConvWin64(
20127 DAG.getMachineFunction().getFunction().getCallingConv()))
20128 // Probably a Win64 va_copy.
20129 return DAG.expandVACopy(Op.getNode());
// Operands: 0 = chain, 1 = dest va_list ptr, 2 = src va_list ptr,
// 3/4 = SrcValues for alias info on dest/src.
20131 SDValue Chain = Op.getOperand(0);
20132 SDValue DstPtr = Op.getOperand(1);
20133 SDValue SrcPtr = Op.getOperand(2);
20134 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
20135 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
// Copy the whole 24-byte SysV va_list structure with an 8-byte-aligned memcpy.
20138 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
20139 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
20141 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
20144 /// Handle vector element shifts where the shift amount is a constant.
20145 /// Takes immediate version of shift as input.
20146 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
20147 SDValue SrcOp, uint64_t ShiftAmt,
20148 SelectionDAG &DAG) {
// \p Opc must be one of the immediate forms VSHLI/VSRLI/VSRAI (asserted
// below). Returns the shifted vector, folding constants where possible.
20149 MVT ElementType = VT.getVectorElementType();
20151 // Bitcast the source vector to the output type, this is mainly necessary for
20152 // vXi8/vXi64 shifts.
20153 if (VT != SrcOp.getSimpleValueType())
20154 SrcOp = DAG.getBitcast(VT, SrcOp);
20156 // Fold this packed shift into its first operand if ShiftAmt is 0.
20160 // Check for ShiftAmt >= element width
// Out-of-range amounts: arithmetic right shift clamps to (bits - 1) so the
// sign bit is replicated; logical shifts produce zero.
20161 if (ShiftAmt >= ElementType.getSizeInBits()) {
20162 if (Opc == X86ISD::VSRAI)
20163 ShiftAmt = ElementType.getSizeInBits() - 1;
20165 return DAG.getConstant(0, dl, VT);
20168 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
20169 && "Unknown target vector shift-by-constant node");
20171 // Fold this packed vector shift into a build vector if SrcOp is a
20172 // vector of Constants or UNDEFs.
20173 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
20174 SmallVector<SDValue, 8> Elts;
20175 unsigned NumElts = SrcOp->getNumOperands();
20176 ConstantSDNode *ND;
// Fold one lane at a time, dispatching on the shift opcode; undef lanes are
// passed through unchanged.
20179 default: llvm_unreachable("Unknown opcode!");
20180 case X86ISD::VSHLI:
20181 for (unsigned i=0; i!=NumElts; ++i) {
20182 SDValue CurrentOp = SrcOp->getOperand(i);
20183 if (CurrentOp->isUndef()) {
20184 Elts.push_back(CurrentOp);
20187 ND = cast<ConstantSDNode>(CurrentOp);
20188 const APInt &C = ND->getAPIntValue();
20189 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
20192 case X86ISD::VSRLI:
20193 for (unsigned i=0; i!=NumElts; ++i) {
20194 SDValue CurrentOp = SrcOp->getOperand(i);
20195 if (CurrentOp->isUndef()) {
20196 Elts.push_back(CurrentOp);
20199 ND = cast<ConstantSDNode>(CurrentOp);
20200 const APInt &C = ND->getAPIntValue();
20201 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
20204 case X86ISD::VSRAI:
20205 for (unsigned i=0; i!=NumElts; ++i) {
20206 SDValue CurrentOp = SrcOp->getOperand(i);
20207 if (CurrentOp->isUndef()) {
20208 Elts.push_back(CurrentOp);
20211 ND = cast<ConstantSDNode>(CurrentOp);
20212 const APInt &C = ND->getAPIntValue();
20213 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
20218 return DAG.getBuildVector(VT, dl, Elts);
// Not foldable: emit the target shift with the amount as an i8 immediate.
20221 return DAG.getNode(Opc, dl, VT, SrcOp,
20222 DAG.getConstant(ShiftAmt, dl, MVT::i8));
20225 /// Handle vector element shifts where the shift amount may or may not be a
20226 /// constant. Takes immediate version of shift as input.
20227 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
20228 SDValue SrcOp, SDValue ShAmt,
20229 const X86Subtarget &Subtarget,
20230 SelectionDAG &DAG) {
// ShAmt is a scalar i32/i64. Constant amounts delegate to the by-constant
// helper; variable amounts are placed in the low 64 bits of a vector and
// use the non-immediate VSHL/VSRL/VSRA forms.
20231 MVT SVT = ShAmt.getSimpleValueType();
20232 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
20234 // Catch shift-by-constant.
20235 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
20236 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
20237 CShAmt->getZExtValue(), DAG);
20239 // Change opcode to non-immediate version
20241 default: llvm_unreachable("Unknown target vector shift node")20242 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
20243 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
20244 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
20247 // Need to build a vector containing shift amount.
20248 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
20249 // +=================+============+=======================================+
20250 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
20251 // +=================+============+=======================================+
20252 // | i64 | Yes, No | Use ShAmt as lowest elt |
20253 // | i32 | Yes | zero-extend in-reg |
20254 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
20255 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
20256 // +=================+============+=======================================+
20258 if (SVT == MVT::i64)
20259 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20260 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20261 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
// Look through the zext and zero-extend the bare i16 in-register instead.
20262 ShAmt = ShAmt.getOperand(0);
20263 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20264 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20265 } else if (Subtarget.hasSSE41() &&
20266 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20267 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
20268 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
// Pre-SSE4.1 fallback: build v4i32 {ShAmt, 0, undef, undef} so the low
// 64 bits hold the zero-extended count.
20270 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
20271 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
20272 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
20275 // The return type has to be a 128-bit type with the same element
20276 // type as the input type.
20277 MVT EltVT = VT.getVectorElementType();
20278 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20280 ShAmt = DAG.getBitcast(ShVT, ShAmt);
20281 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
20284 /// Return Mask with the necessary casting or extending
20285 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
20286 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
20287 const X86Subtarget &Subtarget, SelectionDAG &DAG,
// All-zeros / all-ones integer masks fold directly to vXi1 constants.
20290 if (isAllOnesConstant(Mask))
20291 return DAG.getConstant(1, dl, MaskVT);
20292 if (X86::isZeroNode(Mask))
20293 return DAG.getConstant(0, dl, MaskVT);
// If the integer mask is narrower than MaskVT needs, widen it first; the
// high bits are don't-care, hence ANY_EXTEND.
20295 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20296 // Mask should be extended
20297 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
20298 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
20301 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20302 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20303 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
20304 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
20306 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20307 DAG.getConstant(0, dl, MVT::i32));
20308 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20309 DAG.getConstant(1, dl, MVT::i32));
20311 Lo = DAG.getBitcast(MVT::v32i1, Lo);
20312 Hi = DAG.getBitcast(MVT::v32i1, Hi);
20314 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
// General case: bitcast the integer mask to a same-width vector of i1 and
// take the low MaskVT lanes.
20316 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20317 Mask.getSimpleValueType().getSizeInBits());
20318 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
20319 // are extracted by EXTRACT_SUBVECTOR.
20320 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
20321 DAG.getBitcast(BitcastVT, Mask),
20322 DAG.getIntPtrConstant(0, dl));
20326 /// Return (and \p Op, \p Mask) for compare instructions or
20327 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20328 /// necessary casting or extending for \p Mask when lowering masking intrinsics
20329 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20330 SDValue PreservedSrc,
20331 const X86Subtarget &Subtarget,
20332 SelectionDAG &DAG) {
20333 MVT VT = Op.getSimpleValueType();
20334 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20335 unsigned OpcodeSelect = ISD::VSELECT;
// Fast path: an all-ones mask performs no masking.
20338 if (isAllOnesConstant(Mask))
20341 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20343 switch (Op.getOpcode()) {
// Compare-style nodes already produce a vXi1 result; masking is just an AND
// with the converted mask.
20346 case X86ISD::CMPM_RND:
20347 case X86ISD::VPSHUFBITQMB:
20348 case X86ISD::VFPCLASS:
20349 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20350 case ISD::TRUNCATE:
20351 case X86ISD::VTRUNC:
20352 case X86ISD::VTRUNCS:
20353 case X86ISD::VTRUNCUS:
20354 case X86ISD::CVTPS2PH:
20355 // We can't use ISD::VSELECT here because it is not always "Legal"
20356 // for the destination type. For example vpmovqb require only AVX512
20357 // and vselect that can operate on byte element type require BWI
20358 OpcodeSelect = X86ISD::SELECT;
// An undef pass-through means masked-off lanes are zeroed.
20361 if (PreservedSrc.isUndef())
20362 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20363 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
20366 /// Creates an SDNode for a predicated scalar operation.
20367 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
20368 /// The mask is coming as MVT::i8 and it should be transformed
20369 /// to MVT::v1i1 while lowering masking intrinsics.
20370 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20371 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20372 /// for a scalar instruction.
20373 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20374 SDValue PreservedSrc,
20375 const X86Subtarget &Subtarget,
20376 SelectionDAG &DAG) {
// A constant mask with bit 0 set performs the operation unmasked.
20378 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
20379 if (MaskConst->getZExtValue() & 0x1)
20382 MVT VT = Op.getSimpleValueType();
// Build the v1i1 predicate from the low bit of the i8 mask.
20385 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
20386 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
// Scalar compare/class nodes yield a mask themselves; combine with AND.
20387 if (Op.getOpcode() == X86ISD::FSETCCM ||
20388 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20389 Op.getOpcode() == X86ISD::VFPCLASSS)
20390 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
// An undef pass-through means the masked-off lane is zeroed.
20392 if (PreservedSrc.isUndef())
20393 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20394 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
// Returns the size in bytes of the EH registration node that WinEHStatePass
// establishes in Fn's frame, keyed off its EH personality. Fatal-errors on
// functions without a personality or with an unsupported one.
20397 static int getSEHRegistrationNodeSize(const Function *Fn) {
20398 if (!Fn->hasPersonalityFn())
20399 report_fatal_error(
20400 "querying registration node size for function without personality");
20401 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20402 // WinEHStatePass for the full struct definition.
20403 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20404 case EHPersonality::MSVC_X86SEH: return 24;
20405 case EHPersonality::MSVC_CXX: return 16;
// Any other personality cannot have its frame pointer recovered on x86.
20408 report_fatal_error(
20409 "can only recover FP for 32-bit MSVC EH personality functions");
20412 /// When the MSVC runtime transfers control to us, either to an outlined
20413 /// function or when returning to a parent frame after catching an exception, we
20414 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20415 /// Here's the math:
20416 /// RegNodeBase = EntryEBP - RegNodeSize
20417 /// ParentFP = RegNodeBase - ParentFrameOffset
20418 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20419 /// subtracting the offset (negative on x86) takes us back to the parent FP.
20420 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20421 SDValue EntryEBP) {
20422 MachineFunction &MF = DAG.getMachineFunction();
20425 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20426 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20428 // It's possible that the parent function no longer has a personality function
20429 // if the exceptional code was optimized away, in which case we just return
20430 // the incoming EBP.
20431 if (!Fn->hasPersonalityFn())
20434 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20435 // registration, or the .set_setframe offset.
20436 MCSymbol *OffsetSym =
20437 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20438 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// The offset is only known at MC emission time; LOCAL_RECOVER materializes
// the symbol's value for use in DAG arithmetic.
20439 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20440 SDValue ParentFrameOffset =
20441 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20443 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20444 // prologue to RBP in the parent function.
20445 const X86Subtarget &Subtarget =
20446 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20447 if (Subtarget.is64Bit())
20448 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
// 32-bit: walk back over the registration node, then over the (negative)
// parent-frame offset, per the formula in the header comment.
20450 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20451 // RegNodeBase = EntryEBP - RegNodeSize
20452 // ParentFP = RegNodeBase - ParentFrameOffset
20453 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20454 DAG.getConstant(RegNodeSize, dl, PtrVT));
20455 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
20458 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20459 SelectionDAG &DAG) const {
20460 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20461 auto isRoundModeCurDirection = [](SDValue Rnd) {
20462 if (!isa<ConstantSDNode>(Rnd))
20465 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
20466 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
20470 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20471 MVT VT = Op.getSimpleValueType();
20472 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
20474 switch(IntrData->Type) {
20475 case INTR_TYPE_1OP: {
20476 // We specify 2 possible opcodes for intrinsics with rounding modes.
20477 // First, we check if the intrinsic may have non-default rounding mode,
20478 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20479 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20480 if (IntrWithRoundingModeOpcode != 0) {
20481 SDValue Rnd = Op.getOperand(2);
20482 if (!isRoundModeCurDirection(Rnd)) {
20483 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20484 Op.getOperand(1), Rnd);
20487 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
20489 case INTR_TYPE_2OP:
20490 case INTR_TYPE_2OP_IMM8: {
20491 SDValue Src2 = Op.getOperand(2);
20493 if (IntrData->Type == INTR_TYPE_2OP_IMM8)
20494 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20496 // We specify 2 possible opcodes for intrinsics with rounding modes.
20497 // First, we check if the intrinsic may have non-default rounding mode,
20498 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20499 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20500 if (IntrWithRoundingModeOpcode != 0) {
20501 SDValue Rnd = Op.getOperand(3);
20502 if (!isRoundModeCurDirection(Rnd)) {
20503 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20504 Op.getOperand(1), Src2, Rnd);
20508 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20509 Op.getOperand(1), Src2);
20511 case INTR_TYPE_3OP:
20512 case INTR_TYPE_3OP_IMM8: {
20513 SDValue Src1 = Op.getOperand(1);
20514 SDValue Src2 = Op.getOperand(2);
20515 SDValue Src3 = Op.getOperand(3);
20517 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
20518 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20520 // We specify 2 possible opcodes for intrinsics with rounding modes.
20521 // First, we check if the intrinsic may have non-default rounding mode,
20522 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20523 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20524 if (IntrWithRoundingModeOpcode != 0) {
20525 SDValue Rnd = Op.getOperand(4);
20526 if (!isRoundModeCurDirection(Rnd)) {
20527 return DAG.getNode(IntrWithRoundingModeOpcode,
20528 dl, Op.getValueType(),
20529 Src1, Src2, Src3, Rnd);
20533 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20536 case INTR_TYPE_4OP:
20537 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20538 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20539 case INTR_TYPE_1OP_MASK_RM: {
20540 SDValue Src = Op.getOperand(1);
20541 SDValue PassThru = Op.getOperand(2);
20542 SDValue Mask = Op.getOperand(3);
20543 SDValue RoundingMode;
20544 // We always add rounding mode to the Node.
20545 // If the rounding mode is not specified, we add the
20546 // "current direction" mode.
20547 if (Op.getNumOperands() == 4)
20549 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20551 RoundingMode = Op.getOperand(4);
20552 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20553 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20555 Mask, PassThru, Subtarget, DAG);
20557 case INTR_TYPE_1OP_MASK: {
20558 SDValue Src = Op.getOperand(1);
20559 SDValue PassThru = Op.getOperand(2);
20560 SDValue Mask = Op.getOperand(3);
20561 // We add rounding mode to the Node when
20562 // - RM Opcode is specified and
20563 // - RM is not "current direction".
20564 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20565 if (IntrWithRoundingModeOpcode != 0) {
20566 SDValue Rnd = Op.getOperand(4);
20567 if (!isRoundModeCurDirection(Rnd)) {
20568 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20569 dl, Op.getValueType(),
20571 Mask, PassThru, Subtarget, DAG);
20574 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20575 Mask, PassThru, Subtarget, DAG);
20577 case INTR_TYPE_SCALAR_MASK: {
20578 SDValue Src1 = Op.getOperand(1);
20579 SDValue Src2 = Op.getOperand(2);
20580 SDValue passThru = Op.getOperand(3);
20581 SDValue Mask = Op.getOperand(4);
20582 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20583 // There are 2 kinds of intrinsics in this group:
20584 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20585 // (2) With rounding mode and sae - 7 operands.
20586 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20587 if (Op.getNumOperands() == (5U + HasRounding)) {
20589 SDValue Rnd = Op.getOperand(5);
20590 if (!isRoundModeCurDirection(Rnd))
20591 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20592 dl, VT, Src1, Src2, Rnd),
20593 Mask, passThru, Subtarget, DAG);
20595 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20597 Mask, passThru, Subtarget, DAG);
20600 assert(Op.getNumOperands() == (6U + HasRounding) &&
20601 "Unexpected intrinsic form");
20602 SDValue RoundingMode = Op.getOperand(5);
20604 SDValue Sae = Op.getOperand(6);
20605 if (!isRoundModeCurDirection(Sae))
20606 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20607 dl, VT, Src1, Src2,
20608 RoundingMode, Sae),
20609 Mask, passThru, Subtarget, DAG);
20611 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20612 Src2, RoundingMode),
20613 Mask, passThru, Subtarget, DAG);
20615 case INTR_TYPE_SCALAR_MASK_RM: {
20616 SDValue Src1 = Op.getOperand(1);
20617 SDValue Src2 = Op.getOperand(2);
20618 SDValue Src0 = Op.getOperand(3);
20619 SDValue Mask = Op.getOperand(4);
20620 // There are 2 kinds of intrinsics in this group:
20621 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20622 // (2) With rounding mode and sae - 7 operands.
20623 if (Op.getNumOperands() == 6) {
20624 SDValue Sae = Op.getOperand(5);
20625 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20627 Mask, Src0, Subtarget, DAG);
20629 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20630 SDValue RoundingMode = Op.getOperand(5);
20631 SDValue Sae = Op.getOperand(6);
20632 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20633 RoundingMode, Sae),
20634 Mask, Src0, Subtarget, DAG);
20636 case INTR_TYPE_2OP_MASK: {
20637 SDValue Src1 = Op.getOperand(1);
20638 SDValue Src2 = Op.getOperand(2);
20639 SDValue PassThru = Op.getOperand(3);
20640 SDValue Mask = Op.getOperand(4);
20642 // We specify 2 possible opcodes for intrinsics with rounding modes.
20643 // First, we check if the intrinsic may have non-default rounding mode,
20644 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20645 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20646 if (IntrWithRoundingModeOpcode != 0) {
20647 SDValue Rnd = Op.getOperand(5);
20648 if (!isRoundModeCurDirection(Rnd)) {
20649 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20650 dl, Op.getValueType(),
20652 Mask, PassThru, Subtarget, DAG);
20655 // TODO: Intrinsics should have fast-math-flags to propagate.
20656 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20657 Mask, PassThru, Subtarget, DAG);
20659 case INTR_TYPE_2OP_MASK_RM: {
20660 SDValue Src1 = Op.getOperand(1);
20661 SDValue Src2 = Op.getOperand(2);
20662 SDValue PassThru = Op.getOperand(3);
20663 SDValue Mask = Op.getOperand(4);
20664 // We specify 2 possible modes for intrinsics, with/without rounding
20666 // First, we check if the intrinsic have rounding mode (6 operands),
20667 // if not, we set rounding mode to "current".
20669 if (Op.getNumOperands() == 6)
20670 Rnd = Op.getOperand(5);
20672 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20673 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20675 Mask, PassThru, Subtarget, DAG);
20677 case INTR_TYPE_3OP_SCALAR_MASK: {
20678 SDValue Src1 = Op.getOperand(1);
20679 SDValue Src2 = Op.getOperand(2);
20680 SDValue Src3 = Op.getOperand(3);
20681 SDValue PassThru = Op.getOperand(4);
20682 SDValue Mask = Op.getOperand(5);
20684 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20685 if (IntrWithRoundingModeOpcode != 0) {
20686 SDValue Rnd = Op.getOperand(6);
20687 if (!isRoundModeCurDirection(Rnd))
20688 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20689 dl, VT, Src1, Src2, Src3, Rnd),
20690 Mask, PassThru, Subtarget, DAG);
20692 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20694 Mask, PassThru, Subtarget, DAG);
20696 case INTR_TYPE_3OP_MASK: {
20697 SDValue Src1 = Op.getOperand(1);
20698 SDValue Src2 = Op.getOperand(2);
20699 SDValue Src3 = Op.getOperand(3);
20700 SDValue PassThru = Op.getOperand(4);
20701 SDValue Mask = Op.getOperand(5);
20703 // We specify 2 possible opcodes for intrinsics with rounding modes.
20704 // First, we check if the intrinsic may have non-default rounding mode,
20705 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20706 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20707 if (IntrWithRoundingModeOpcode != 0) {
20708 SDValue Rnd = Op.getOperand(6);
20709 if (!isRoundModeCurDirection(Rnd)) {
20710 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20711 dl, Op.getValueType(),
20712 Src1, Src2, Src3, Rnd),
20713 Mask, PassThru, Subtarget, DAG);
20716 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20718 Mask, PassThru, Subtarget, DAG);
20721 SDValue Src1 = Op.getOperand(1);
20722 SDValue Src2 = Op.getOperand(2);
20724 // Swap Src1 and Src2 in the node creation
20725 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
20728 case FMA_OP_MASK: {
20729 SDValue Src1 = Op.getOperand(1);
20730 SDValue Src2 = Op.getOperand(2);
20731 SDValue Src3 = Op.getOperand(3);
20732 SDValue Mask = Op.getOperand(4);
20733 MVT VT = Op.getSimpleValueType();
20734 SDValue PassThru = SDValue();
20736 // set PassThru element
20737 if (IntrData->Type == FMA_OP_MASKZ)
20738 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20742 // We specify 2 possible opcodes for intrinsics with rounding modes.
20743 // First, we check if the intrinsic may have non-default rounding mode,
20744 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20745 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20746 if (IntrWithRoundingModeOpcode != 0) {
20747 SDValue Rnd = Op.getOperand(5);
20748 if (!isRoundModeCurDirection(Rnd))
20749 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20750 dl, Op.getValueType(),
20751 Src1, Src2, Src3, Rnd),
20752 Mask, PassThru, Subtarget, DAG);
20754 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20755 dl, Op.getValueType(),
20757 Mask, PassThru, Subtarget, DAG);
20760 // NOTE: We need to swizzle the operands to pass the multiply operands
20762 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20763 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
20765 // ISD::FP_ROUND has a second argument that indicates if the truncation
20766 // does not change the value. Set it to 0 since it can change.
20767 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20768 DAG.getIntPtrConstant(0, dl));
20769 case CVTPD2PS_MASK: {
20770 SDValue Src = Op.getOperand(1);
20771 SDValue PassThru = Op.getOperand(2);
20772 SDValue Mask = Op.getOperand(3);
20773 // We add rounding mode to the Node when
20774 // - RM Opcode is specified and
20775 // - RM is not "current direction".
20776 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20777 if (IntrWithRoundingModeOpcode != 0) {
20778 SDValue Rnd = Op.getOperand(4);
20779 if (!isRoundModeCurDirection(Rnd)) {
20780 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20781 dl, Op.getValueType(),
20783 Mask, PassThru, Subtarget, DAG);
20786 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20787 // ISD::FP_ROUND has a second argument that indicates if the truncation
20788 // does not change the value. Set it to 0 since it can change.
20789 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20790 DAG.getIntPtrConstant(0, dl)),
20791 Mask, PassThru, Subtarget, DAG);
20794 // FPclass intrinsics
20795 SDValue Src1 = Op.getOperand(1);
20796 MVT MaskVT = Op.getSimpleValueType();
20797 SDValue Imm = Op.getOperand(2);
20798 return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20801 SDValue Src1 = Op.getOperand(1);
20802 SDValue Imm = Op.getOperand(2);
20803 SDValue Mask = Op.getOperand(3);
20804 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20805 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20807 // Need to fill with zeros to ensure the bitcast will produce zeroes
20808 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20809 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20810 DAG.getConstant(0, dl, MVT::v8i1),
20811 FPclassMask, DAG.getIntPtrConstant(0, dl));
20812 return DAG.getBitcast(MVT::i8, Ins);
20815 // Comparison intrinsics with masks.
20816 // Example of transformation:
20817 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20818 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20820 // (v8i1 (insert_subvector zero,
20821 // (v2i1 (and (PCMPEQM %a, %b),
20822 // (extract_subvector
20823 // (v8i1 (bitcast %mask)), 0))), 0))))
20824 MVT VT = Op.getOperand(1).getSimpleValueType();
20825 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20826 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20827 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20828 Mask.getSimpleValueType().getSizeInBits());
20829 SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20831 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20833 // Need to fill with zeros to ensure the bitcast will produce zeroes
20834 // for the upper bits in the v2i1/v4i1 case.
20835 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20836 DAG.getConstant(0, dl, BitcastVT),
20837 CmpMask, DAG.getIntPtrConstant(0, dl));
20838 return DAG.getBitcast(Op.getValueType(), Res);
20841 case CMP_MASK_CC: {
20842 MVT MaskVT = Op.getSimpleValueType();
20844 SDValue CC = Op.getOperand(3);
20845 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20846 // We specify 2 possible opcodes for intrinsics with rounding modes.
20847 // First, we check if the intrinsic may have non-default rounding mode,
20848 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20849 if (IntrData->Opc1 != 0) {
20850 SDValue Rnd = Op.getOperand(4);
20851 if (!isRoundModeCurDirection(Rnd))
20852 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20853 Op.getOperand(2), CC, Rnd);
20855 //default rounding mode
20856 if (!Cmp.getNode())
20857 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20858 Op.getOperand(2), CC);
20862 case CMP_MASK_SCALAR_CC: {
20863 SDValue Src1 = Op.getOperand(1);
20864 SDValue Src2 = Op.getOperand(2);
20865 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20866 SDValue Mask = Op.getOperand(4);
20869 if (IntrData->Opc1 != 0) {
20870 SDValue Rnd = Op.getOperand(5);
20871 if (!isRoundModeCurDirection(Rnd))
20872 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20874 //default rounding mode
20876 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20878 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20880 // Need to fill with zeros to ensure the bitcast will produce zeroes
20881 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20882 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20883 DAG.getConstant(0, dl, MVT::v8i1),
20884 CmpMask, DAG.getIntPtrConstant(0, dl));
20885 return DAG.getBitcast(MVT::i8, Ins);
20887 case COMI: { // Comparison intrinsics
20888 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20889 SDValue LHS = Op.getOperand(1);
20890 SDValue RHS = Op.getOperand(2);
20891 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20892 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20895 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20896 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20897 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20898 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20901 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20902 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20903 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20904 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20907 case ISD::SETGT: // (CF = 0 and ZF = 0)
20908 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20910 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20911 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20914 case ISD::SETGE: // CF = 0
20915 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20917 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20918 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20921 llvm_unreachable("Unexpected illegal condition!");
20923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20925 case COMI_RM: { // Comparison intrinsics with Sae
20926 SDValue LHS = Op.getOperand(1);
20927 SDValue RHS = Op.getOperand(2);
20928 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20929 SDValue Sae = Op.getOperand(4);
20932 if (isRoundModeCurDirection(Sae))
20933 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20934 DAG.getConstant(CondVal, dl, MVT::i8));
20936 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20937 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20938 // Need to fill with zeros to ensure the bitcast will produce zeroes
20939 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20940 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20941 DAG.getConstant(0, dl, MVT::v16i1),
20942 FCmp, DAG.getIntPtrConstant(0, dl));
20943 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
20944 DAG.getBitcast(MVT::i16, Ins));
20947 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20948 Op.getOperand(1), Op.getOperand(2), Subtarget,
20950 case COMPRESS_EXPAND_IN_REG: {
20951 SDValue Mask = Op.getOperand(3);
20952 SDValue DataToCompress = Op.getOperand(1);
20953 SDValue PassThru = Op.getOperand(2);
20954 if (isAllOnesConstant(Mask)) // return data as is
20955 return Op.getOperand(1);
20957 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20959 Mask, PassThru, Subtarget, DAG);
20962 case FIXUPIMMS_MASKZ:
20964 case FIXUPIMM_MASKZ:{
20965 SDValue Src1 = Op.getOperand(1);
20966 SDValue Src2 = Op.getOperand(2);
20967 SDValue Src3 = Op.getOperand(3);
20968 SDValue Imm = Op.getOperand(4);
20969 SDValue Mask = Op.getOperand(5);
20970 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20971 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20972 // We specify 2 possible modes for intrinsics, with/without rounding
20974 // First, we check if the intrinsic have rounding mode (7 operands),
20975 // if not, we set rounding mode to "current".
20977 if (Op.getNumOperands() == 7)
20978 Rnd = Op.getOperand(6);
20980 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20981 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20982 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20983 Src1, Src2, Src3, Imm, Rnd),
20984 Mask, Passthru, Subtarget, DAG);
20985 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20986 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20987 Src1, Src2, Src3, Imm, Rnd),
20988 Mask, Passthru, Subtarget, DAG);
20991 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20992 // Clear the upper bits of the rounding immediate so that the legacy
20993 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20994 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20996 DAG.getConstant(0xf, dl, MVT::i32));
20997 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20998 Op.getOperand(1), RoundingMode);
21001 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
21002 // Clear the upper bits of the rounding immediate so that the legacy
21003 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
21004 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
21006 DAG.getConstant(0xf, dl, MVT::i32));
21007 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
21008 Op.getOperand(1), Op.getOperand(2), RoundingMode);
21016 default: return SDValue(); // Don't custom lower most intrinsics.
21018 // ptest and testp intrinsics. The intrinsic these come from are designed to
21019 // return an integer value, not just an instruction so lower it to the ptest
21020 // or testp pattern and a setcc for the result.
21021 case Intrinsic::x86_sse41_ptestz:
21022 case Intrinsic::x86_sse41_ptestc:
21023 case Intrinsic::x86_sse41_ptestnzc:
21024 case Intrinsic::x86_avx_ptestz_256:
21025 case Intrinsic::x86_avx_ptestc_256:
21026 case Intrinsic::x86_avx_ptestnzc_256:
21027 case Intrinsic::x86_avx_vtestz_ps:
21028 case Intrinsic::x86_avx_vtestc_ps:
21029 case Intrinsic::x86_avx_vtestnzc_ps:
21030 case Intrinsic::x86_avx_vtestz_pd:
21031 case Intrinsic::x86_avx_vtestc_pd:
21032 case Intrinsic::x86_avx_vtestnzc_pd:
21033 case Intrinsic::x86_avx_vtestz_ps_256:
21034 case Intrinsic::x86_avx_vtestc_ps_256:
21035 case Intrinsic::x86_avx_vtestnzc_ps_256:
21036 case Intrinsic::x86_avx_vtestz_pd_256:
21037 case Intrinsic::x86_avx_vtestc_pd_256:
21038 case Intrinsic::x86_avx_vtestnzc_pd_256: {
21039 bool IsTestPacked = false;
21040 X86::CondCode X86CC;
21042 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
21043 case Intrinsic::x86_avx_vtestz_ps:
21044 case Intrinsic::x86_avx_vtestz_pd:
21045 case Intrinsic::x86_avx_vtestz_ps_256:
21046 case Intrinsic::x86_avx_vtestz_pd_256:
21047 IsTestPacked = true;
21049 case Intrinsic::x86_sse41_ptestz:
21050 case Intrinsic::x86_avx_ptestz_256:
21052 X86CC = X86::COND_E;
21054 case Intrinsic::x86_avx_vtestc_ps:
21055 case Intrinsic::x86_avx_vtestc_pd:
21056 case Intrinsic::x86_avx_vtestc_ps_256:
21057 case Intrinsic::x86_avx_vtestc_pd_256:
21058 IsTestPacked = true;
21060 case Intrinsic::x86_sse41_ptestc:
21061 case Intrinsic::x86_avx_ptestc_256:
21063 X86CC = X86::COND_B;
21065 case Intrinsic::x86_avx_vtestnzc_ps:
21066 case Intrinsic::x86_avx_vtestnzc_pd:
21067 case Intrinsic::x86_avx_vtestnzc_ps_256:
21068 case Intrinsic::x86_avx_vtestnzc_pd_256:
21069 IsTestPacked = true;
21071 case Intrinsic::x86_sse41_ptestnzc:
21072 case Intrinsic::x86_avx_ptestnzc_256:
21074 X86CC = X86::COND_A;
21078 SDValue LHS = Op.getOperand(1);
21079 SDValue RHS = Op.getOperand(2);
21080 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
21081 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
21082 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
21083 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21086 case Intrinsic::x86_sse42_pcmpistria128:
21087 case Intrinsic::x86_sse42_pcmpestria128:
21088 case Intrinsic::x86_sse42_pcmpistric128:
21089 case Intrinsic::x86_sse42_pcmpestric128:
21090 case Intrinsic::x86_sse42_pcmpistrio128:
21091 case Intrinsic::x86_sse42_pcmpestrio128:
21092 case Intrinsic::x86_sse42_pcmpistris128:
21093 case Intrinsic::x86_sse42_pcmpestris128:
21094 case Intrinsic::x86_sse42_pcmpistriz128:
21095 case Intrinsic::x86_sse42_pcmpestriz128: {
21097 X86::CondCode X86CC;
21099 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21100 case Intrinsic::x86_sse42_pcmpistria128:
21101 Opcode = X86ISD::PCMPISTR;
21102 X86CC = X86::COND_A;
21104 case Intrinsic::x86_sse42_pcmpestria128:
21105 Opcode = X86ISD::PCMPESTR;
21106 X86CC = X86::COND_A;
21108 case Intrinsic::x86_sse42_pcmpistric128:
21109 Opcode = X86ISD::PCMPISTR;
21110 X86CC = X86::COND_B;
21112 case Intrinsic::x86_sse42_pcmpestric128:
21113 Opcode = X86ISD::PCMPESTR;
21114 X86CC = X86::COND_B;
21116 case Intrinsic::x86_sse42_pcmpistrio128:
21117 Opcode = X86ISD::PCMPISTR;
21118 X86CC = X86::COND_O;
21120 case Intrinsic::x86_sse42_pcmpestrio128:
21121 Opcode = X86ISD::PCMPESTR;
21122 X86CC = X86::COND_O;
21124 case Intrinsic::x86_sse42_pcmpistris128:
21125 Opcode = X86ISD::PCMPISTR;
21126 X86CC = X86::COND_S;
21128 case Intrinsic::x86_sse42_pcmpestris128:
21129 Opcode = X86ISD::PCMPESTR;
21130 X86CC = X86::COND_S;
21132 case Intrinsic::x86_sse42_pcmpistriz128:
21133 Opcode = X86ISD::PCMPISTR;
21134 X86CC = X86::COND_E;
21136 case Intrinsic::x86_sse42_pcmpestriz128:
21137 Opcode = X86ISD::PCMPESTR;
21138 X86CC = X86::COND_E;
21141 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21142 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21143 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
21144 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
21145 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21148 case Intrinsic::x86_sse42_pcmpistri128:
21149 case Intrinsic::x86_sse42_pcmpestri128: {
21151 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
21152 Opcode = X86ISD::PCMPISTR;
21154 Opcode = X86ISD::PCMPESTR;
21156 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21157 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21158 return DAG.getNode(Opcode, dl, VTs, NewOps);
21161 case Intrinsic::x86_sse42_pcmpistrm128:
21162 case Intrinsic::x86_sse42_pcmpestrm128: {
21164 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
21165 Opcode = X86ISD::PCMPISTR;
21167 Opcode = X86ISD::PCMPESTR;
21169 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21170 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21171 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
21174 case Intrinsic::eh_sjlj_lsda: {
21175 MachineFunction &MF = DAG.getMachineFunction();
21176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21177 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21178 auto &Context = MF.getMMI().getContext();
21179 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21180 Twine(MF.getFunctionNumber()));
21181 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
21182 DAG.getMCSymbol(S, PtrVT));
21185 case Intrinsic::x86_seh_lsda: {
21186 // Compute the symbol for the LSDA. We know it'll get emitted later.
21187 MachineFunction &MF = DAG.getMachineFunction();
21188 SDValue Op1 = Op.getOperand(1);
21189 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21190 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21191 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21193 // Generate a simple absolute symbol reference. This intrinsic is only
21194 // supported on 32-bit Windows, which isn't PIC.
21195 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
21196 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
21199 case Intrinsic::x86_seh_recoverfp: {
21200 SDValue FnOp = Op.getOperand(1);
21201 SDValue IncomingFPOp = Op.getOperand(2);
21202 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
21203 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
21205 report_fatal_error(
21206 "llvm.x86.seh.recoverfp must take a function as the first argument");
21207 return recoverFramePointer(DAG, Fn, IncomingFPOp);
21210 case Intrinsic::localaddress: {
21211 // Returns one of the stack, base, or frame pointer registers, depending on
21212 // which is used to reference local variables.
21213 MachineFunction &MF = DAG.getMachineFunction();
21214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21216 if (RegInfo->hasBasePointer(MF))
21217 Reg = RegInfo->getBaseRegister();
21218 else // This function handles the SP or FP case.
21219 Reg = RegInfo->getPtrSizedFrameRegister(MF);
21220 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
21225 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21226 SDValue Src, SDValue Mask, SDValue Base,
21227 SDValue Index, SDValue ScaleOp, SDValue Chain,
21228 const X86Subtarget &Subtarget) {
21230 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21231 // Scale must be constant.
21234 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21235 EVT MaskVT = Mask.getValueType();
21236 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21237 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21238 SDValue Segment = DAG.getRegister(0, MVT::i32);
21239 // If source is undef or we know it won't be used, use a zero vector
21240 // to break register dependency.
21241 // TODO: use undef instead and let BreakFalseDeps deal with it?
21242 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21243 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21244 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21245 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21246 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21247 return DAG.getMergeValues(RetOps, dl);
21250 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21251 SDValue Src, SDValue Mask, SDValue Base,
21252 SDValue Index, SDValue ScaleOp, SDValue Chain,
21253 const X86Subtarget &Subtarget) {
21255 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21256 // Scale must be constant.
21259 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21260 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21261 Index.getSimpleValueType().getVectorNumElements());
21263 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21264 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21265 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21266 SDValue Segment = DAG.getRegister(0, MVT::i32);
21267 // If source is undef or we know it won't be used, use a zero vector
21268 // to break register dependency.
21269 // TODO: use undef instead and let BreakFalseDeps deal with it?
21270 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21271 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21272 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21273 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21274 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21275 return DAG.getMergeValues(RetOps, dl);
21278 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21279 SDValue Src, SDValue Mask, SDValue Base,
21280 SDValue Index, SDValue ScaleOp, SDValue Chain,
21281 const X86Subtarget &Subtarget) {
21283 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21284 // Scale must be constant.
21287 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21288 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21289 SDValue Segment = DAG.getRegister(0, MVT::i32);
21290 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21291 Index.getSimpleValueType().getVectorNumElements());
21293 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21294 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21295 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21296 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21297 return SDValue(Res, 1);
21300 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21301 SDValue Mask, SDValue Base, SDValue Index,
21302 SDValue ScaleOp, SDValue Chain,
21303 const X86Subtarget &Subtarget) {
21305 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21306 // Scale must be constant.
21309 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21310 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21311 SDValue Segment = DAG.getRegister(0, MVT::i32);
21313 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21314 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21315 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21316 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
21317 return SDValue(Res, 0);
21320 /// Handles the lowering of builtin intrinsic that return the value
21321 /// of the extended control register.
21322 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
21324 const X86Subtarget &Subtarget,
21325 SmallVectorImpl<SDValue> &Results) {
21326 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21327 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21330 // The ECX register is used to select the index of the XCR register to
21333 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
21334 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
21335 Chain = SDValue(N1, 0);
21337 // Reads the content of XCR and returns it in registers EDX:EAX.
21338 if (Subtarget.is64Bit()) {
21339 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
21340 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21343 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
21344 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21347 Chain = HI.getValue(1);
21349 if (Subtarget.is64Bit()) {
21350 // Merge the two 32-bit values into a 64-bit one..
21351 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21352 DAG.getConstant(32, DL, MVT::i8));
21353 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21354 Results.push_back(Chain);
21358 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21359 SDValue Ops[] = { LO, HI };
21360 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21361 Results.push_back(Pair);
21362 Results.push_back(Chain);
21365 /// Handles the lowering of builtin intrinsics that read performance monitor
21366 /// counters (x86_rdpmc).
21367 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21369 const X86Subtarget &Subtarget,
21370 SmallVectorImpl<SDValue> &Results) {
21371 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21372 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21375 // The ECX register is used to select the index of the performance counter
21377 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21379 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21381 // Reads the content of a 64-bit performance counter and returns it in the
21382 // registers EDX:EAX.
21383 if (Subtarget.is64Bit()) {
21384 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21385 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21388 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21389 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21392 Chain = HI.getValue(1);
21394 if (Subtarget.is64Bit()) {
21395 // The EAX register is loaded with the low-order 32 bits. The EDX register
21396 // is loaded with the supported high-order bits of the counter.
21397 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21398 DAG.getConstant(32, DL, MVT::i8));
21399 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21400 Results.push_back(Chain);
21404 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21405 SDValue Ops[] = { LO, HI };
21406 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21407 Results.push_back(Pair);
21408 Results.push_back(Chain);
21411 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21412 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21413 /// READCYCLECOUNTER nodes.
21414 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21416 const X86Subtarget &Subtarget,
21417 SmallVectorImpl<SDValue> &Results) {
21418 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21419 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21422 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21423 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21424 // and the EAX register is loaded with the low-order 32 bits.
21425 if (Subtarget.is64Bit()) {
21426 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21427 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21430 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21431 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21434 SDValue Chain = HI.getValue(1);
21436 if (Opcode == X86ISD::RDTSCP_DAG) {
21437 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21439 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21440 // the ECX register. Add 'ecx' explicitly to the chain.
21441 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21443 // Explicitly store the content of ECX at the location passed in input
21444 // to the 'rdtscp' intrinsic.
21445 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21446 MachinePointerInfo());
21449 if (Subtarget.is64Bit()) {
21450 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21451 // the EAX register is loaded with the low-order 32 bits.
21452 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21453 DAG.getConstant(32, DL, MVT::i8));
21454 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21455 Results.push_back(Chain);
21459 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21460 SDValue Ops[] = { LO, HI };
21461 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21462 Results.push_back(Pair);
21463 Results.push_back(Chain);
21466 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21467 SelectionDAG &DAG) {
21468 SmallVector<SDValue, 2> Results;
21470 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21472 return DAG.getMergeValues(Results, DL);
21475 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21476 MachineFunction &MF = DAG.getMachineFunction();
21477 SDValue Chain = Op.getOperand(0);
21478 SDValue RegNode = Op.getOperand(2);
21479 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21481 report_fatal_error("EH registrations only live in functions using WinEH");
21483 // Cast the operand to an alloca, and remember the frame index.
21484 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21486 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21487 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21489 // Return the chain operand without making any DAG nodes.
21493 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21494 MachineFunction &MF = DAG.getMachineFunction();
21495 SDValue Chain = Op.getOperand(0);
21496 SDValue EHGuard = Op.getOperand(2);
21497 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21499 report_fatal_error("EHGuard only live in functions using WinEH");
21501 // Cast the operand to an alloca, and remember the frame index.
21502 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21504 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21505 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21507 // Return the chain operand without making any DAG nodes.
21511 /// Emit Truncating Store with signed or unsigned saturation.
/// Builds a target memory node (TruncSStore / TruncUSStore) whose operand
/// list is { Chain, Val, Ptr, Undef-index }; SignedSat selects which node
/// class is used.
// NOTE(review): the `return SignedSat ? ... : ...` selector line appears
// partially elided in this excerpt — confirm against the full file.
21513 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21514 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21515 SelectionDAG &DAG) {
21517 SDVTList VTs = DAG.getVTList(MVT::Other);
21518 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21519 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21521 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21522 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21525 /// Emit Masked Truncating Store with signed or unsigned saturation.
/// Same idea as EmitTruncSStore but with a per-element Mask operand; note the
/// operand order here is { Chain, Ptr, Mask, Val }.
21527 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21528 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21529 MachineMemOperand *MMO, SelectionDAG &DAG) {
21531 SDVTList VTs = DAG.getVTList(MVT::Other);
21532 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21534 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21535 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
// Lower chained (side-effecting) X86 intrinsics.  Intrinsics without an
// IntrinsicData table entry are handled by an explicit switch on IntNo;
// those with a table entry dispatch on IntrData->Type below.
// NOTE(review): several structural lines (switch headers, break statements,
// closing braces, case labels such as GATHER/SCATTER/PREFETCH/RDTSC) appear
// to be missing from this excerpt — confirm against the full file.
21538 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21539 SelectionDAG &DAG) {
21540 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21542 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
// SEH bookkeeping intrinsics: record state, emit no code.
21545 case llvm::Intrinsic::x86_seh_ehregnode:
21546 return MarkEHRegistrationNode(Op, DAG);
21547 case llvm::Intrinsic::x86_seh_ehguard:
21548 return MarkEHGuard(Op, DAG);
21549 case llvm::Intrinsic::x86_flags_read_u32:
21550 case llvm::Intrinsic::x86_flags_read_u64:
21551 case llvm::Intrinsic::x86_flags_write_u32:
21552 case llvm::Intrinsic::x86_flags_write_u64: {
21553 // We need a frame pointer because this will get lowered to a PUSH/POP
21555 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21556 MFI.setHasCopyImplyingStackAdjustment(true);
21557 // Don't do anything here, we will expand these intrinsics out later
21558 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
// LWPINS/UMWAIT/TPAUSE: emit the node, materialize CF as an i8 result.
21561 case Intrinsic::x86_lwpins32:
21562 case Intrinsic::x86_lwpins64:
21563 case Intrinsic::x86_umwait:
21564 case Intrinsic::x86_tpause: {
21566 SDValue Chain = Op->getOperand(0);
21567 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21571 default: llvm_unreachable("Impossible intrinsic");
21572 case Intrinsic::x86_umwait:
21573 Opcode = X86ISD::UMWAIT;
21575 case Intrinsic::x86_tpause:
21576 Opcode = X86ISD::TPAUSE;
21578 case Intrinsic::x86_lwpins32:
21579 case Intrinsic::x86_lwpins64:
21580 Opcode = X86ISD::LWPINS;
21584 SDValue Operation =
21585 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
21586 Op->getOperand(3), Op->getOperand(4));
21587 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
21588 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21589 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21590 Operation.getValue(1));
// Table-driven lowering for intrinsics with IntrinsicData entries.
21597 switch(IntrData->Type) {
21598 default: llvm_unreachable("Unknown Intrinsic Type");
21601 // Emit the node with the right value type.
21602 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21603 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21605 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21606 // Otherwise return the value from Rand, which is always 0, casted to i32.
21607 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21608 DAG.getConstant(1, dl, Op->getValueType(1)),
21609 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21610 SDValue(Result.getNode(), 1) };
21611 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21613 // Return { result, isValid, chain }.
21614 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21615 SDValue(Result.getNode(), 2));
21617 case GATHER_AVX2: {
21618 SDValue Chain = Op.getOperand(0);
21619 SDValue Src = Op.getOperand(2);
21620 SDValue Base = Op.getOperand(3);
21621 SDValue Index = Op.getOperand(4);
21622 SDValue Mask = Op.getOperand(5);
21623 SDValue Scale = Op.getOperand(6);
21624 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21625 Scale, Chain, Subtarget);
21628 //gather(v1, mask, index, base, scale);
21629 SDValue Chain = Op.getOperand(0);
21630 SDValue Src = Op.getOperand(2);
21631 SDValue Base = Op.getOperand(3);
21632 SDValue Index = Op.getOperand(4);
21633 SDValue Mask = Op.getOperand(5);
21634 SDValue Scale = Op.getOperand(6);
21635 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21639 //scatter(base, mask, index, v1, scale);
21640 SDValue Chain = Op.getOperand(0);
21641 SDValue Base = Op.getOperand(2);
21642 SDValue Mask = Op.getOperand(3);
21643 SDValue Index = Op.getOperand(4);
21644 SDValue Src = Op.getOperand(5);
21645 SDValue Scale = Op.getOperand(6);
21646 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21647 Scale, Chain, Subtarget);
// Prefetch: hint 3 selects Opc0 (T0), hint 2 selects Opc1 (T1).
21650 SDValue Hint = Op.getOperand(6);
21651 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21652 assert((HintVal == 2 || HintVal == 3) &&
21653 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21654 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21655 SDValue Chain = Op.getOperand(0);
21656 SDValue Mask = Op.getOperand(2);
21657 SDValue Index = Op.getOperand(3);
21658 SDValue Base = Op.getOperand(4);
21659 SDValue Scale = Op.getOperand(5);
21660 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21663 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21665 SmallVector<SDValue, 2> Results;
21666 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21668 return DAG.getMergeValues(Results, dl);
21670 // Read Performance Monitoring Counters.
21672 SmallVector<SDValue, 2> Results;
21673 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21674 return DAG.getMergeValues(Results, dl);
21676 // Get Extended Control Register.
21678 SmallVector<SDValue, 2> Results;
21679 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21680 return DAG.getMergeValues(Results, dl);
21682 // XTEST intrinsics.
21684 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21685 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21687 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21688 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21689 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21690 Ret, SDValue(InTrans.getNode(), 1));
// ADX-style add-with-carry: regenerate CF from the carry flag operand
// (ADD with -1 sets CF iff the operand is nonzero), do the carried add,
// store the sum, and return the outgoing carry as a SETCC.
21694 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21695 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
21696 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21697 DAG.getConstant(-1, dl, MVT::i8));
21698 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21699 Op.getOperand(4), GenCF.getValue(1));
21700 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21701 Op.getOperand(5), MachinePointerInfo());
21702 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21703 SDValue Results[] = { SetCC, Store };
21704 return DAG.getMergeValues(Results, dl);
21706 case TRUNCATE_TO_MEM_VI8:
21707 case TRUNCATE_TO_MEM_VI16:
21708 case TRUNCATE_TO_MEM_VI32: {
21709 SDValue Mask = Op.getOperand(4);
21710 SDValue DataToTruncate = Op.getOperand(3);
21711 SDValue Addr = Op.getOperand(2);
21712 SDValue Chain = Op.getOperand(0);
21714 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21715 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21717 EVT MemVT = MemIntr->getMemoryVT();
21719 uint16_t TruncationOp = IntrData->Opc0;
21720 switch (TruncationOp) {
21721 case X86ISD::VTRUNC: {
21722 if (isAllOnesConstant(Mask)) // return just a truncate store
21723 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21724 MemIntr->getMemOperand());
21726 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21727 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21729 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21730 MemIntr->getMemOperand(), true /* truncating */);
21732 case X86ISD::VTRUNCUS:
21733 case X86ISD::VTRUNCS: {
21734 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21735 if (isAllOnesConstant(Mask))
21736 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21737 MemIntr->getMemOperand(), DAG);
21739 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21740 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21742 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21743 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21746 llvm_unreachable("Unsupported truncstore intrinsic");
// Lower ISD::RETURNADDR.  For depth > 0, walk one frame up via FRAMEADDR and
// load the slot just above the saved frame pointer; for depth 0, load from
// the return-address frame index directly.
// NOTE(review): the `if (Depth > 0)` line and the SDLoc declaration appear to
// be missing from this excerpt — confirm against the full file.
21752 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21753 SelectionDAG &DAG) const {
21754 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21755 MFI.setReturnAddressIsTaken(true);
21757 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21760 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21762 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21765 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21766 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21767 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21768 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21769 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21770 MachinePointerInfo());
21773 // Just load the return address.
21774 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21775 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21776 MachinePointerInfo());
// Lower ISD::ADDROFRETURNADDR: the address of the return-address slot is
// simply its frame index; also mark the return address as taken.
21779 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21780 SelectionDAG &DAG) const {
21781 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21782 return getReturnAddressFrameIndex(DAG);
// Lower ISD::FRAMEADDR.  On Windows-CFI targets, return a fixed frame object
// (depth crawling is not possible with Windows unwind codes); otherwise read
// the frame register and chase saved-FP links once per depth level.
// NOTE(review): the depth loop header (`while (Depth--)`) and return line
// appear to be missing from this excerpt — confirm against the full file.
21785 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21786 MachineFunction &MF = DAG.getMachineFunction();
21787 MachineFrameInfo &MFI = MF.getFrameInfo();
21788 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21789 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21790 EVT VT = Op.getValueType();
21792 MFI.setFrameAddressIsTaken(true);
21794 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21795 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21796 // is not possible to crawl up the stack without looking at the unwind codes
21798 int FrameAddrIndex = FuncInfo->getFAIndex();
21799 if (!FrameAddrIndex) {
21800 // Set up a frame object for the return address.
21801 unsigned SlotSize = RegInfo->getSlotSize();
21802 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21803 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21804 FuncInfo->setFAIndex(FrameAddrIndex);
21806 return DAG.getFrameIndex(FrameAddrIndex, VT);
21809 unsigned FrameReg =
21810 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21811 SDLoc dl(Op); // FIXME probably not meaningful
21812 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21813 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21814 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21815 "Invalid Frame Register!");
21816 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21818 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21819 MachinePointerInfo());
21823 // FIXME? Maybe this could be a TableGen attribute on some registers and
21824 // this table could be generated automatically from RegInfo.
// Resolve a named register ("esp"/"rsp"/"ebp"/"rbp") for
// llvm.read_register / llvm.write_register.  EBP/RBP are only valid when the
// function actually keeps a frame pointer.
// NOTE(review): the `.Default(0)` terminator, the `return Reg` paths and the
// closing braces appear to be missing from this excerpt — confirm against
// the full file.
21825 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21826 SelectionDAG &DAG) const {
21827 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21828 const MachineFunction &MF = DAG.getMachineFunction();
21830 unsigned Reg = StringSwitch<unsigned>(RegName)
21831 .Case("esp", X86::ESP)
21832 .Case("rsp", X86::RSP)
21833 .Case("ebp", X86::EBP)
21834 .Case("rbp", X86::RBP)
21837 if (Reg == X86::EBP || Reg == X86::RBP) {
21838 if (!TFI.hasFP(MF))
21839 report_fatal_error("register " + StringRef(RegName) +
21840 " is allocatable: function has no frame pointer");
21843 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21844 unsigned FrameReg =
21845 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21846 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21847 "Invalid Frame Register!");
21855 report_fatal_error("Invalid register name global variable");
// Lower ISD::FRAME_TO_ARGS_OFFSET: the arguments start two slots (saved FP +
// return address) above the frame pointer.
21858 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21859 SelectionDAG &DAG) const {
21860 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21861 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
// Register carrying the exception pointer into a landing pad:
// RDX/EDX for CoreCLR personalities, RAX/EAX otherwise (LP64 vs ILP32).
21864 unsigned X86TargetLowering::getExceptionPointerRegister(
21865 const Constant *PersonalityFn) const {
21866 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21867 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21869 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
// Register carrying the exception selector into a landing pad (RDX/EDX).
21872 unsigned X86TargetLowering::getExceptionSelectorRegister(
21873 const Constant *PersonalityFn) const {
21874 // Funclet personalities don't use selectors (the runtime does the selection).
21875 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21876 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
// Win64 requires catch objects to live at fixed frame offsets.
21879 bool X86TargetLowering::needsFixedCatchObjects() const {
21880 return Subtarget.isTargetWin64();
// Lower ISD::EH_RETURN: compute the adjusted store address
// (FP + slot + Offset), store the handler there, stash the address in
// RCX/ECX, and emit the X86ISD::EH_RETURN pseudo.
// NOTE(review): the SDLoc declaration for `dl` appears to be missing from
// this excerpt — confirm against the full file.
21883 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21884 SDValue Chain = Op.getOperand(0);
21885 SDValue Offset = Op.getOperand(1);
21886 SDValue Handler = Op.getOperand(2);
21889 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21890 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21891 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21892 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21893 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21894 "Invalid Frame Register!");
21895 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21896 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21898 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21899 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21901 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21902 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21903 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21905 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21906 DAG.getRegister(StoreAddrReg, PtrVT));
// Lower the SjLj setjmp intrinsic to the X86 EH_SJLJ_SETJMP node.  On 32-bit
// targets, force creation of the global base register first so the later
// pseudo expansion can reference it.
21909 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21910 SelectionDAG &DAG) const {
21912 // If the subtarget is not 64bit, we may need the global base reg
21913 // after isel expand pseudo, i.e., after CGBR pass ran.
21914 // Therefore, ask for the GlobalBaseReg now, so that the pass
21915 // inserts the code for us in case we need it.
21916 // Otherwise, we will end up in a situation where we will
21917 // reference a virtual register that is not defined!
21918 if (!Subtarget.is64Bit()) {
21919 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21920 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21922 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21923 DAG.getVTList(MVT::i32, MVT::Other),
21924 Op.getOperand(0), Op.getOperand(1));
// Lower the SjLj longjmp intrinsic to the X86 EH_SJLJ_LONGJMP node.
21927 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21928 SelectionDAG &DAG) const {
21930 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21931 Op.getOperand(0), Op.getOperand(1));
// Lower the SjLj setup-dispatch intrinsic to its X86 node (chain-only).
21934 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21935 SelectionDAG &DAG) const {
21937 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
// ADJUST_TRAMPOLINE is a no-op on x86: the trampoline address is usable
// directly, so just forward the operand.
21941 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21942 return Op.getOperand(0);
// Lower INIT_TRAMPOLINE: write machine code into the trampoline buffer.
// 64-bit: movabsq $fptr, %r11 ; movabsq $nest, %r10 ; jmpq *%r11.
// 32-bit: mov $nest, %NestReg ; jmp <rel32 to fptr>, where NestReg depends
// on the callee's calling convention (ECX for C/StdCall, EAX for
// FastCall/ThisCall/Fast).
// NOTE(review): several lines (SDLoc, OutChains[1]/[3] assignment heads,
// switch header on CC, `break`s, closing braces) appear to be missing from
// this excerpt — confirm against the full file.
21945 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21946 SelectionDAG &DAG) const {
21947 SDValue Root = Op.getOperand(0);
21948 SDValue Trmp = Op.getOperand(1); // trampoline
21949 SDValue FPtr = Op.getOperand(2); // nested function
21950 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21953 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21954 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21956 if (Subtarget.is64Bit()) {
21957 SDValue OutChains[6];
21959 // Large code-model.
21960 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21961 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21963 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21964 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21966 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
21968 // Load the pointer to the nested function into R11.
21969 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21970 SDValue Addr = Trmp;
21971 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21972 Addr, MachinePointerInfo(TrmpAddr));
21974 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21975 DAG.getConstant(2, dl, MVT::i64));
21977 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21978 /* Alignment = */ 2);
21980 // Load the 'nest' parameter value into R10.
21981 // R10 is specified in X86CallingConv.td
21982 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21983 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21984 DAG.getConstant(10, dl, MVT::i64));
21985 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21986 Addr, MachinePointerInfo(TrmpAddr, 10));
21988 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21989 DAG.getConstant(12, dl, MVT::i64));
21991 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21992 /* Alignment = */ 2);
21994 // Jump to the nested function.
21995 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21996 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21997 DAG.getConstant(20, dl, MVT::i64));
21998 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21999 Addr, MachinePointerInfo(TrmpAddr, 20));
22001 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
22002 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
22003 DAG.getConstant(22, dl, MVT::i64));
22004 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
22005 Addr, MachinePointerInfo(TrmpAddr, 22));
22007 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
22009 const Function *Func =
22010 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
22011 CallingConv::ID CC = Func->getCallingConv();
22016 llvm_unreachable("Unsupported calling convention");
22017 case CallingConv::C:
22018 case CallingConv::X86_StdCall: {
22019 // Pass 'nest' parameter in ECX.
22020 // Must be kept in sync with X86CallingConv.td
22021 NestReg = X86::ECX;
22023 // Check that ECX wasn't needed by an 'inreg' parameter.
22024 FunctionType *FTy = Func->getFunctionType();
22025 const AttributeList &Attrs = Func->getAttributes();
22027 if (!Attrs.isEmpty() && !Func->isVarArg()) {
22028 unsigned InRegCount = 0;
22031 for (FunctionType::param_iterator I = FTy->param_begin(),
22032 E = FTy->param_end(); I != E; ++I, ++Idx)
22033 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
22034 auto &DL = DAG.getDataLayout();
22035 // FIXME: should only count parameters that are lowered to integers.
22036 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
22039 if (InRegCount > 2) {
22040 report_fatal_error("Nest register in use - reduce number of inreg"
22046 case CallingConv::X86_FastCall:
22047 case CallingConv::X86_ThisCall:
22048 case CallingConv::Fast:
22049 // Pass 'nest' parameter in EAX.
22050 // Must be kept in sync with X86CallingConv.td
22051 NestReg = X86::EAX;
22055 SDValue OutChains[4];
22056 SDValue Addr, Disp;
22058 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22059 DAG.getConstant(10, dl, MVT::i32));
22060 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
22062 // This is storing the opcode for MOV32ri.
22063 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
22064 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
22066 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
22067 Trmp, MachinePointerInfo(TrmpAddr));
22069 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22070 DAG.getConstant(1, dl, MVT::i32));
22072 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
22073 /* Alignment = */ 1);
22075 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
22076 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22077 DAG.getConstant(5, dl, MVT::i32));
22078 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
22079 Addr, MachinePointerInfo(TrmpAddr, 5),
22080 /* Alignment = */ 1);
22082 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22083 DAG.getConstant(6, dl, MVT::i32));
22085 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
22086 /* Alignment = */ 1);
22088 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
// Lower ISD::FLT_ROUNDS_: spill the x87 control word (FNSTCW), reload it,
// and remap bits 11:10 into the FLT_ROUNDS encoding via
// (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3).
// NOTE(review): the block-comment delimiters around the mode table and the
// CWD/CWD1/CWD2/RetVal assignment heads appear partially elided in this
// excerpt — confirm against the full file.
22092 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
22093 SelectionDAG &DAG) const {
22095 The rounding mode is in bits 11:10 of FPSR, and has the following
22097 00 Round to nearest
22102 FLT_ROUNDS, on the other hand, expects the following:
22109 To perform the conversion, we do:
22110 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
22113 MachineFunction &MF = DAG.getMachineFunction();
22114 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
22115 unsigned StackAlignment = TFI.getStackAlignment();
22116 MVT VT = Op.getSimpleValueType();
22119 // Save FP Control Word to stack slot
22120 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
22121 SDValue StackSlot =
22122 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
22124 MachineMemOperand *MMO =
22125 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
22126 MachineMemOperand::MOStore, 2, 2);
22128 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
22129 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
22130 DAG.getVTList(MVT::Other),
22131 Ops, MVT::i16, MMO);
22133 // Load FP Control Word from stack slot
22135 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
22137 // Transform as necessary
22139 DAG.getNode(ISD::SRL, DL, MVT::i16,
22140 DAG.getNode(ISD::AND, DL, MVT::i16,
22141 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
22142 DAG.getConstant(11, DL, MVT::i8))
22144 DAG.getNode(ISD::SRL, DL, MVT::i16,
22145 DAG.getNode(ISD::AND, DL, MVT::i16,
22146 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
22147 DAG.getConstant(9, DL, MVT::i8));
22150 DAG.getNode(ISD::AND, DL, MVT::i16,
22151 DAG.getNode(ISD::ADD, DL, MVT::i16,
22152 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
22153 DAG.getConstant(1, DL, MVT::i16)),
22154 DAG.getConstant(3, DL, MVT::i16));
22156 return DAG.getNode((VT.getSizeInBits() < 16 ?
22157 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
22160 // Split an unary integer op into 2 half sized ops.
// Extract the Lo/Hi halves of the source, apply the same opcode to each
// half, and concatenate the results back to the original type.
22161 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
22162 MVT VT = Op.getSimpleValueType();
22163 unsigned NumElems = VT.getVectorNumElements();
22164 unsigned SizeInBits = VT.getSizeInBits();
22165 MVT EltVT = VT.getVectorElementType();
22166 SDValue Src = Op.getOperand(0);
22167 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
22168 "Src and Op should have the same element type!");
22170 // Extract the Lo/Hi vectors
22172 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
22173 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
22175 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
22176 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22177 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
22178 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
22181 // Decompose 256-bit ops into smaller 128-bit ops.
// Thin wrapper over LowerVectorIntUnary that asserts the 256-bit
// integer-vector precondition.
22182 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
22183 assert(Op.getSimpleValueType().is256BitVector() &&
22184 Op.getSimpleValueType().isInteger() &&
22185 "Only handle AVX 256-bit vector integer operation");
22186 return LowerVectorIntUnary(Op, DAG);
22189 // Decompose 512-bit ops into smaller 256-bit ops.
// Thin wrapper over LowerVectorIntUnary that asserts the 512-bit
// integer-vector precondition.
22190 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
22191 assert(Op.getSimpleValueType().is512BitVector() &&
22192 Op.getSimpleValueType().isInteger() &&
22193 "Only handle AVX 512-bit vector integer operation");
22194 return LowerVectorIntUnary(Op, DAG);
22197 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
22199 // i8/i16 vector implemented using dword LZCNT vector instruction
22200 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
22201 // split the vector, perform operation on it's Lo a Hi part and
22202 // concatenate the results.
22203 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
22204 const X86Subtarget &Subtarget) {
22205 assert(Op.getOpcode() == ISD::CTLZ);
22207 MVT VT = Op.getSimpleValueType();
22208 MVT EltVT = VT.getVectorElementType();
22209 unsigned NumElems = VT.getVectorNumElements();
22211 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
22212 "Unsupported element type");
22214 // Split vector, it's Lo and Hi parts will be handled in next iteration.
22215 if (NumElems > 16 ||
22216 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
22217 return LowerVectorIntUnary(Op, DAG);
22219 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
22220 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
22221 "Unsupported value type for operation");
22223 // Use native supported vector instruction vplzcntd.
22224 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
22225 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
22226 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
// The i32 lzcnt over-counts by (32 - element width); subtract the delta.
22227 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
22229 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
22232 // Lower CTLZ using a PSHUFB lookup table implementation.
// Per-nibble CTLZ via a PSHUFB LUT on the byte vector, then repeatedly
// combine adjacent halves (nibbles -> bytes -> ... -> element width).
// NOTE(review): the declaration line for HiZ and the final `return` appear
// to be missing from this excerpt — confirm against the full file.
22233 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
22234 const X86Subtarget &Subtarget,
22235 SelectionDAG &DAG) {
22236 MVT VT = Op.getSimpleValueType();
22237 int NumElts = VT.getVectorNumElements();
22238 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
22239 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
22241 // Per-nibble leading zero PSHUFB lookup table.
22242 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
22243 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
22244 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
22245 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
22247 SmallVector<SDValue, 64> LUTVec;
22248 for (int i = 0; i < NumBytes; ++i)
22249 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22250 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
22252 // Begin by bitcasting the input to byte vector, then split those bytes
22253 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
22254 // If the hi input nibble is zero then we add both results together, otherwise
22255 // we just take the hi result (by masking the lo result to zero before the
22257 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
22258 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
22260 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
22261 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
22262 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
22263 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
// 512-bit compares produce an i1 mask; sign-extend it back to a byte mask.
22265 if (CurrVT.is512BitVector()) {
22266 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22267 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
22268 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22270 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
22273 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
22274 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
22275 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
22276 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
22278 // Merge result back from vXi8 back to VT, working on the lo/hi halves
22279 // of the current vector width in the same way we did for the nibbles.
22280 // If the upper half of the input element is zero then add the halves'
22281 // leading zero counts together, otherwise just use the upper half's.
22282 // Double the width of the result until we are at target width.
22283 while (CurrVT != VT) {
22284 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
22285 int CurrNumElts = CurrVT.getVectorNumElements();
22286 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
22287 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
22288 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
22290 // Check if the upper half of the input element is zero.
22291 if (CurrVT.is512BitVector()) {
22292 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22293 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
22294 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22295 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22297 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
22298 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22300 HiZ = DAG.getBitcast(NextVT, HiZ);
22302 // Move the upper/lower halves to the lower bits as we'll be extending to
22303 // NextVT. Mask the lower result to zero if HiZ is true and add the results
22305 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
22306 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
22307 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
22308 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
22309 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
// Dispatch vector CTLZ lowering: prefer AVX512CD's native lzcnt when usable,
// otherwise split oversized vectors, otherwise fall back to the PSHUFB LUT.
22316 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
22317 const X86Subtarget &Subtarget,
22318 SelectionDAG &DAG) {
22319 MVT VT = Op.getSimpleValueType();
22321 if (Subtarget.hasCDI() &&
22322 // vXi8 vectors need to be promoted to 512-bits for vXi32.
22323 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
22324 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
22326 // Decompose 256-bit ops into smaller 128-bit ops.
22327 if (VT.is256BitVector() && !Subtarget.hasInt256())
22328 return Lower256IntUnary(Op, DAG);
22330 // Decompose 512-bit ops into smaller 256-bit ops.
22331 if (VT.is512BitVector() && !Subtarget.hasBWI())
22332 return Lower512IntUnary(Op, DAG);
22334 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
22335 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
// Lower CTLZ / CTLZ_ZERO_UNDEF.  Vectors dispatch to LowerVectorCTLZ;
// scalars use BSR (i8 is widened to i32 first).  BSR returns the index of
// the highest set bit, so the result is XORed with NumBits-1; for plain
// CTLZ a CMOV on ZF supplies the all-zero-input value.
// NOTE(review): lines such as the vector-type check, the Ops[] declaration
// head for the CMOV, and the i8 re-truncation condition appear to be missing
// from this excerpt — confirm against the full file.
22338 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
22339 SelectionDAG &DAG) {
22340 MVT VT = Op.getSimpleValueType();
22342 unsigned NumBits = VT.getSizeInBits();
22344 unsigned Opc = Op.getOpcode();
22347 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22349 Op = Op.getOperand(0);
22350 if (VT == MVT::i8) {
22351 // Zero extend to i32 since there is not an i8 bsr.
22353 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22356 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22357 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22358 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22360 if (Opc == ISD::CTLZ) {
22361 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22364 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22365 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22368 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22371 // Finally xor with NumBits-1.
22372 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22373 DAG.getConstant(NumBits - 1, dl, OpVT));
22376 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
// Lower CTTZ / CTTZ_ZERO_UNDEF.  Vectors: isolate the lowest set bit with
// (x & -x), then either (width-1) - ctlz(lsb) for the zero-undef form or
// ctpop(lsb - 1) otherwise.  Scalars use BSF with a CMOV on ZF to return
// NumBits for a zero input.
// NOTE(review): the Ops[] declaration head for the final CMOV appears to be
// missing from this excerpt — confirm against the full file.
22380 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22381 MVT VT = Op.getSimpleValueType();
22382 unsigned NumBits = VT.getScalarSizeInBits();
22385 if (VT.isVector()) {
22386 SDValue N0 = Op.getOperand(0);
22387 SDValue Zero = DAG.getConstant(0, dl, VT);
22389 // lsb(x) = (x & -x)
22390 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22391 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22393 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22394 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22395 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22396 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22397 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22400 // cttz(x) = ctpop(lsb - 1)
22401 SDValue One = DAG.getConstant(1, dl, VT);
22402 return DAG.getNode(ISD::CTPOP, dl, VT,
22403 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
22406 assert(Op.getOpcode() == ISD::CTTZ &&
22407 "Only scalar CTTZ requires custom lowering");
22409 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22410 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22411 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22413 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22416 DAG.getConstant(NumBits, dl, VT),
22417 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22420 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
///
/// Used for binary (two-operand) integer ops on pre-AVX2 targets, where
/// 256-bit integer instructions are unavailable: each operand is split into
/// its low/high 128-bit halves, the op is re-emitted on each half pair, and
/// the halves are rejoined with CONCAT_VECTORS.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");
  unsigned NumElems = VT.getVectorNumElements();
  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
  // Same element type, half as many elements per half-vector.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
///
/// Counterpart of Lower256IntArith for 512-bit types: splits both operands
/// into 256-bit halves, re-emits the same opcode on each half pair, and
/// rejoins the results with CONCAT_VECTORS.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");
  unsigned NumElems = VT.getVectorNumElements();
  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
  // Same element type, half as many elements per half-vector.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
/// Lower vector ADD/SUB.  For vXi1 vectors both add and sub are equivalent
/// to XOR (mod-2 arithmetic); anything else reaching here is a 256-bit
/// integer op that must be split into 128-bit halves.
static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    // i1 add/sub == xor: 0+0=0, 0+1=1, 1+1=0 (and likewise for sub).
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
/// Lower ISD::ABS.  Scalar i16/i32/i64 use the classic NEG + CMOV sequence;
/// 256-bit integer vectors are split into 128-bit halves.
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDValue N0 = Op.getOperand(0);
    // Compute 0 - N0; X86ISD::SUB also produces EFLAGS (second result),
    // which the CMOV below consumes.
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    // Select between N0 and its negation based on the flags from the
    // subtraction (COND_GE), yielding |N0|.
    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
/// Lower vector SMIN/SMAX/UMIN/UMAX that have no direct instruction:
/// split AVX1 256-bit cases, use the signbit-flip trick for pre-SSE41
/// v8i16 UMIN/UMAX, and otherwise expand to compare + select.
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  // For AVX1 cases, split to use legal ops (everything but v4i64).
  if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
    return Lower256IntArith(Op, DAG);
  unsigned Opcode = Op.getOpcode();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
  // using the SMIN/SMAX instructions and flipping the signbit back.
  if (VT == MVT::v8i16) {
    assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
           "Unexpected MIN/MAX opcode");
    // XOR with 0x8000 maps unsigned order onto signed order and back.
    SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
    N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
    N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
    Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
    SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
    return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
  // Else, expand to a compare/select.
  case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
  case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
  case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
  case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
  default: llvm_unreachable("Unknown MINMAX opcode");
  SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
  return DAG.getSelect(DL, VT, Cond, N0, N1);
/// Lower vector ISD::MUL for types without a native multiply:
///  - vXi1: multiply is AND.
///  - vXi8: widen to i16, multiply with pmullw, truncate/pack back.
///  - v4i32 pre-SSE41: synthesize from two pmuludq plus shuffles.
///  - vXi64 (no DQI): schoolbook 32x32->64 decomposition via pmuludq,
///    skipping partial products that known-bits analysis proves are zero.
static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    // i1 multiply is logical AND (1*1=1, anything else = 0).
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);
  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);
      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);
      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;
    // Extract the lo parts and sign extend to i16
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element (hence the -1/undef entries interleaved in the mask).
    const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
                              4, -1, 5, -1, 6, -1, 7, -1};
    SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
    SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    // Extract the hi parts and sign extend to i16
    // We're going to mask off the low byte of each result element of the
    // pmullw, so it doesn't matter what's in the high byte of each 16-bit
    // element.
    const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
                              12, -1, 13, -1, 14, -1, 15, -1};
    SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
    SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    // Multiply, mask the lower 8bits of the lo/hi results and pack
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");
    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
    // Multiply the even parts.
    // (PMULUDQ multiplies the even-indexed 32-bit lanes into 64-bit lanes;
    // only the low 32 bits of each product are kept after the bitcast.)
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                                DAG.getBitcast(MVT::v2i64, A),
                                DAG.getBitcast(MVT::v2i64, B));
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
                               DAG.getBitcast(MVT::v2i64, Aodds),
                               DAG.getBitcast(MVT::v2i64, Bodds));
    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);
    // Merge the two vectors back together with a shuffle. This expands into 2
    // instructions.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
  assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  KnownBits AKnown, BKnown;
  DAG.computeKnownBits(A, AKnown);
  DAG.computeKnownBits(B, BKnown);
  // Detect halves that are provably zero so the corresponding partial
  // products (and their pmuludq) can be elided entirely.
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
  bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
  bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
  // Combine the cross terms, shift them into the high 32 bits, and add the
  // low product.  (Ahi*Bhi contributes only above bit 64 and is dropped.)
  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
/// Lower vXi8 ISD::MULHS/MULHU (high half of the widened product), which
/// x86 has no instruction for: extend each byte to i16 (sign- or
/// zero-extend depending on MULHS/MULHU), multiply, shift the product
/// right by 8, and pack back down to bytes.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);
  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
         "Unsupported vector type");
  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);
  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
  // For 512-bit vectors, split into 256-bit vectors to allow the
  // sign-extension to occur.
  if (VT == MVT::v64i8)
    return Lower512IntArith(Op, DAG);
  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
    if (VT == MVT::v32i8) {
      if (Subtarget.canExtendTo512BW()) {
        // AVX512BW path: extend the whole vector to v32i16 in one go.
        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
                          DAG.getConstant(8, dl, MVT::v32i16));
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
      // No 512-bit support: process the two 128-bit halves as v16i16.
      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
      const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    assert(VT == MVT::v16i8 && "Unexpected VT");
    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                      DAG.getConstant(8, dl, MVT::v16i16));
    // If we have BWI we can use truncate instruction.
    if (Subtarget.hasBWI())
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
    // Otherwise split the v16i16 product and pack the halves down to v16i8.
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;
  unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
                                          : ISD::SIGN_EXTEND_VECTOR_INREG;
  // Extract the lo parts and zero/sign extend to i16.
  if (Subtarget.hasSSE41()) {
    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
    // Pre-SSE41: interleave bytes into the high half of each i16 lane, then
    // shift (arithmetic or logical per ExShift) down to extend.
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  // Extract the hi parts and zero/sign extend to i16.
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
    const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
  // pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
/// Lower 128-bit integer division/remainder on Win64 by calling the
/// matching compiler-rt libcall (__divti3 family).  Each i128 operand is
/// spilled to a 16-byte-aligned stack temporary and passed by pointer;
/// the call result is produced as v2i64 and bitcast back to the original
/// 128-bit integer type.
SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");
  // Pick the libcall and its signedness from the node's opcode.
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
  case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
  case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
  case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
  case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  SDValue InChain = DAG.getEntryNode();
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    // Store the i128 operand to a 16-byte-aligned stack slot and pass its
    // address to the libcall instead of the value itself.
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy,0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      getLibcallCallingConv(LC),
      // The libcall returns its i128 result as v2i64.
      static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);
  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
/// Lower ISD::SMUL_LOHI / ISD::UMUL_LOHI for vXi32 using PMULUDQ/PMULDQ,
/// which widen even-indexed 32-bit lanes into 64-bit products.  Two
/// multiplies (even lanes and odd-lanes-shifted-even) are combined with
/// shuffles into the (low, high) result pair.
static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
    // Re-concatenate both results (low values, high values) of the halves.
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    return DAG.getMergeValues(Ops, dl);
  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));
  int NumElts = VT.getVectorNumElements();
  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widen result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], NumElts));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], NumElts));
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
    // PMULDQ (signed) needs SSE41; otherwise use PMULUDQ and fix up below.
    (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                DAG.getBitcast(MulVT, Op0),
                                                DAG.getBitcast(MulVT, Op1)));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
                                                DAG.getBitcast(MulVT, Odd0),
                                                DAG.getBitcast(MulVT, Odd1)));
  // Shuffle it back into the right order.
  SmallVector<int, 16> HighMask(NumElts);
  SmallVector<int, 16> LowMask(NumElts);
  for (int i = 0; i != NumElts; ++i) {
    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  // If we have a signed multiply but no PMULDQ fix up the high parts of an
  // unsigned multiply.
  if (IsSigned && !Subtarget.hasSSE41()) {
    // Standard unsigned->signed high-part correction:
    //   hi_s = hi_u - (a < 0 ? b : 0) - (b < 0 ? a : 0)
    // implemented with arithmetic shifts producing all-ones masks.
    SDValue ShAmt = DAG.getConstant(
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
// Elements narrower than 16 bits have no immediate-shift instructions;
// 512-bit vectors need AVX-512 (plus BWI for 16-bit elements); and
// arithmetic right shift of i64 lanes additionally requires AVX-512.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
  if (VT.getScalarSizeInBits() < 16)
  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
  // Logical shifts: SSE2 for 128-bit, AVX2 for 256-bit vectors.
  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());
  // Arithmetic shifts additionally exclude i64 lanes unless AVX-512.
  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate, so support
// is identical to the immediate form and simply forwards to it.
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
// Per-lane variable shifts first appeared with AVX2; vXi16 variants need
// AVX-512 BWI; AVX-512 supports all remaining forms.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
  // vXi16 supported only on AVX-512, BWI
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
  if (Subtarget.hasAVX512())
  // AVX2: logical shifts for 128/256-bit; arithmetic excludes i64 lanes.
  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
/// Lower vector shifts (SHL/SRL/SRA) whose amount is a uniform constant.
/// Handles: directly-supported immediate shifts; i64 arithmetic right
/// shift (no hardware instruction pre-AVX512) via 32-bit partial shifts;
/// vXi8 shifts via widened i16 shifts plus masking; and i64 shift amounts
/// that 32-bit targets have expanded into hi/lo constant pairs.
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
  // Emulate a 64-bit arithmetic right shift using 32-bit VSRAI/VSRLI and
  // lane shuffles, since x86 lacks a vector i64 SRA before AVX-512.
  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);
    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
        getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
      // SRA upper i32, SHL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
        getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    return DAG.getBitcast(VT, Ex);
  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();
      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
      // i64 SRA needs to be performed as partial shifts.
      if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
           (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
          Op.getOpcode() == ISD::SRA)
        return ArithmeticShiftRight64(ShiftAmt);
      // vXi8 shifts: no byte-granular shift instructions exist, so shift
      // as i16 lanes and mask away the bits that crossed byte boundaries.
      if (VT == MVT::v16i8 ||
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
          VT == MVT::v64i8) {
        unsigned NumElts = VT.getVectorNumElements();
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
        // Simple i8 add case
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
          return DAG.getNode(ISD::ADD, dl, VT, R, R);
        // ashr(R, 7) === cmp_slt(R, 0)
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
          if (VT.is512BitVector()) {
            assert(VT == MVT::v64i8 && "Unexpected element type!");
            SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
          SHL = DAG.getBitcast(VT, SHL);
          // Zero out the rightmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
          SRL = DAG.getBitcast(VT, SRL);
          // Zero out the leftmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
        if (Op.getOpcode() == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
          // where Mask has the (shifted) sign bit set: the xor/sub pair
          // sign-extends the logically-shifted result.
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
        llvm_unreachable("Unknown shift opcode.");
  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (!Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
    // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
    unsigned SubVectorScale = 1;
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
        Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    // Peek through any splat that was introduced for i64 shift vectorization.
    int SplatIndex = -1;
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
      if (SVN->isSplat()) {
        SplatIndex = SVN->getSplatIndex();
        Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
    Amt = Amt.getOperand(0);
    // Ratio = how many build_vector constants make up one shifted element.
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     (SubVectorScale * VT.getVectorNumElements());
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    // Reassemble the 64-bit shift amount from its constant sub-pieces.
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        // All lanes must use the same amount for an immediate shift.
        if (ShAmt != ShiftAmt)
    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
23229 // Determine if V is a splat value, and return the scalar.
// NOTE(review): this chunk is an elided dump -- the embedded original line
// numbers jump (e.g. 23233 -> 23235), so some statements, early-returns and
// closing braces of this function are not visible here. Comments below only
// describe the code that IS visible.
//
// Strategy: peel EXTRACT_SUBVECTOR wrappers off V, then try three ways of
// recovering the splatted scalar: (1) a splat BUILD_VECTOR, (2) a
// SUB(splat_bv, splat_shuffle) pattern produced by rotate lowering, and
// (3) a splat shuffle, from which the scalar is extracted directly.
23230 static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
23231 SelectionDAG &DAG, const X86Subtarget &Subtarget,
23233 V = peekThroughEXTRACT_SUBVECTORs(V);
23235 // Check if this is a splat build_vector node.
23236 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
23237 SDValue SplatAmt = BV->getSplatValue();
// An all-undef splat is rejected (the success path for a defined splat is
// among the elided lines -- TODO confirm against upstream source).
23238 if (SplatAmt && SplatAmt.isUndef())
23243 // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
// Only bother when the target cannot do the variable shift natively;
// otherwise scalarizing the amount would not pay off.
23244 if (V.getOpcode() == ISD::SUB &&
23245 !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
23246 SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
23247 SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
23249 // Ensure that the corresponding splat BV element is not UNDEF.
23250 BitVector UndefElts;
23251 BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
23252 ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
23253 if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
23254 unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
// Extract the (SUB'd) element at the shuffle's splat index as the
// scalar shift/rotate amount.
23255 if (!UndefElts[SplatIdx])
23256 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23257 VT.getVectorElementType(), V,
23258 DAG.getIntPtrConstant(SplatIdx, dl));
23262 // Check if this is a shuffle node doing a splat.
23263 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
23264 if (!SVN || !SVN->isSplat())
23267 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
23268 SDValue InVec = V.getOperand(0);
// If the shuffle's source is a BUILD_VECTOR, the splatted scalar is simply
// that operand of the build vector.
23269 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
23270 assert((SplatIdx < VT.getVectorNumElements()) &&
23271 "Unexpected shuffle index found!");
23272 return InVec.getOperand(SplatIdx);
// If the source is INSERT_VECTOR_ELT at exactly the splatted lane, the
// inserted scalar is the splat value.
23273 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
23274 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
23275 if (C->getZExtValue() == SplatIdx)
23276 return InVec.getOperand(1);
23279 // Avoid introducing an extract element from a shuffle.
// Fall back: extract from the shuffle's *input* vector rather than the
// shuffle itself.
23280 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23281 VT.getVectorElementType(), InVec,
23282 DAG.getIntPtrConstant(SplatIdx, dl));
// Lower a vector shift whose amount is (or can be reduced to) a single
// scalar broadcast to every lane: prefer the X86ISD::VSHLI/VSRLI/VSRAI
// immediate forms when a splat scalar is recoverable, otherwise the
// X86ISD::VSHL/VSRL/VSRA base-amount forms.
// NOTE(review): elided dump -- original lines (e.g. 23288, 23311-23313,
// 23325-23327) are missing, including the SDLoc 'dl' declaration and some
// loop-exit control flow. Comments describe only the visible code.
23285 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
23286 const X86Subtarget &Subtarget) {
23287 MVT VT = Op.getSimpleValueType();
23289 SDValue R = Op.getOperand(0);
23290 SDValue Amt = Op.getOperand(1);
23291 unsigned Opcode = Op.getOpcode();
// Map the generic ISD shift opcode to the X86 immediate-shift node...
23293 unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
23294 (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
// ...and to the X86 variable (base-amount-in-vector) shift node.
23296 unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
23297 (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
23299 Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
23301 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
23302 if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
23303 MVT EltVT = VT.getVectorElementType();
23304 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
// Widen sub-i64/sub-i32 scalar amounts so getTargetVShiftNode sees a
// legal scalar type.
23305 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
23306 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
23307 else if (EltVT.bitsLT(MVT::i32))
23308 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
23310 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
23314 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23315 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
23316 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
23317 Amt = Amt.getOperand(0);
// Number of narrow build_vector elements that make up one i64 lane.
23318 unsigned Ratio = 64 / Amt.getScalarValueSizeInBits();
23319 std::vector<SDValue> Vals(Ratio);
23320 for (unsigned i = 0; i != Ratio; ++i)
23321 Vals[i] = Amt.getOperand(i);
// Verify every i64 lane is built from the same narrow elements, i.e. the
// amount is effectively a splat of one 64-bit value. (The mismatch
// early-exit is among the elided lines -- TODO confirm.)
23322 for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) {
23323 for (unsigned j = 0; j != Ratio; ++j)
23324 if (Vals[j] != Amt.getOperand(i + j))
23328 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
23329 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
23334 // Convert a shift/rotate left amount to a multiplication scale factor.
// i.e. produce a vector of (1 << Amt[i]) per-element scale factors so that
// SHL can be lowered as a MUL. Returns a null SDValue for unsupported types
// (the bail-out return is among the elided lines of this dump).
// NOTE(review): elided dump -- some lines (e.g. 23342-23343, 23350,
// 23355-23357) are missing; comments describe visible code only.
23335 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
23336 const X86Subtarget &Subtarget,
23337 SelectionDAG &DAG) {
23338 MVT VT = Amt.getSimpleValueType();
// Only handle types where the MUL-based expansion is profitable/legal.
23339 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
23340 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
23341 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
// Constant amounts: fold each element to the constant (1 << amt) directly.
23344 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
23345 SmallVector<SDValue, 8> Elts;
23346 MVT SVT = VT.getVectorElementType();
23347 unsigned SVTBits = SVT.getSizeInBits();
23348 APInt One(SVTBits, 1);
23349 unsigned NumElems = VT.getVectorNumElements();
23351 for (unsigned i = 0; i != NumElems; ++i) {
23352 SDValue Op = Amt->getOperand(i);
23353 if (Op->isUndef()) {
23354 Elts.push_back(Op);
23358 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23359 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23360 uint64_t ShAmt = C.getZExtValue();
// Out-of-range shift amounts produce an undefined lane.
23361 if (ShAmt >= SVTBits) {
23362 Elts.push_back(DAG.getUNDEF(SVT));
23365 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23367 return DAG.getBuildVector(VT, dl, Elts);
23370 // If the target doesn't support variable shifts, use either FP conversion
23371 // or integer multiplication to avoid shifting each element individually.
// v4i32 trick: build the float 2^Amt by placing Amt into the exponent field
// (shift into bits [30:23], add the bias-127 pattern 0x3f800000), then
// convert back to integer -- yielding 1 << Amt per lane.
23372 if (VT == MVT::v4i32) {
23373 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23374 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
23375 DAG.getConstant(0x3f800000U, dl, VT));
23376 Amt = DAG.getBitcast(MVT::v4f32, Amt);
23377 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
23380 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
// v8i16 without AVX2: zero-unpack to two v4i32 halves, recurse, then pack
// the halves back together (PACKUS on SSE4.1, otherwise an even-lane
// shuffle on the truncated bitcasts).
23381 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
23382 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23383 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
23384 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
23385 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
23386 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
23387 if (Subtarget.hasSSE41())
23388 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23390 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
23391 DAG.getBitcast(VT, Hi),
23392 {0, 2, 4, 6, 8, 10, 12, 14});
// Custom-lower vector SHL/SRL/SRA for types the target cannot shift
// natively, trying a cascade of strategies from cheapest to most general:
// scalar-immediate, scalar-variable, native variable shifts, XOP, per-lane
// v2i64 splitting, SRA-via-SRL emulation, two-shift+blend, MUL-based SHL,
// MULHU-based SRL, per-lane v4i32, widen-shift-truncate, vXi8/vXi16
// bit-serial VSELECT ladders, and finally 256-bit splitting.
// NOTE(review): elided dump -- original line numbers jump throughout, so
// early-return/bail-out lines, some 'break's and closing braces are missing
// from view. Comments describe only the visible code.
23398 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
23399 SelectionDAG &DAG) {
23400 MVT VT = Op.getSimpleValueType();
23402 SDValue R = Op.getOperand(0);
23403 SDValue Amt = Op.getOperand(1);
23404 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23406 assert(VT.isVector() && "Custom lowering only for vector shifts!");
23407 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
// Cheapest options first: uniform-constant and splat-scalar amounts.
23409 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
23412 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
23415 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
23418 // XOP has 128-bit variable logical/arithmetic shifts.
23419 // +ve/-ve Amt = shift left/right.
23420 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
23421 VT == MVT::v8i16 || VT == MVT::v16i8)) {
// Right shifts are expressed as left shifts by a negated amount.
23422 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
23423 SDValue Zero = DAG.getConstant(0, dl, VT);
23424 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
23426 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
23427 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
23428 if (Op.getOpcode() == ISD::SRA)
23429 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
23432 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
23433 // shifts per-lane and then shuffle the partial results back together.
23434 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23435 // Splat the shift amounts so the scalar shifts above will catch it.
23436 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23437 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23438 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23439 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
// Take lane 0 of R0 and lane 1 of R1.
23440 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23443 // i64 vector arithmetic shift can be emulated with the transform:
23444 // M = lshr(SIGN_MASK, Amt)
23445 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
23446 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23447 Op.getOpcode() == ISD::SRA) {
23448 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23449 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23450 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23451 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23452 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23456 // If possible, lower this shift as a sequence of two shifts by
23457 // constant plus a BLENDing shuffle instead of scalarizing it.
23459 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23461 // Could be rewritten as:
23462 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23464 // The advantage is that the two shifts from the example would be
23465 // lowered as X86ISD::VSRLI nodes in parallel before blending.
23466 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23467 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
23468 SDValue Amt1, Amt2;
23469 unsigned NumElts = VT.getVectorNumElements();
23470 SmallVector<int, 8> ShuffleMask;
// Classify each lane's amount as Amt1 or Amt2, building the blend mask as
// we go; more than two distinct amounts leaves ShuffleMask short and the
// guard below rejects the transform.
23471 for (unsigned i = 0; i != NumElts; ++i) {
23472 SDValue A = Amt->getOperand(i);
23474 ShuffleMask.push_back(SM_SentinelUndef);
23477 if (!Amt1 || Amt1 == A) {
23478 ShuffleMask.push_back(i);
23482 if (!Amt2 || Amt2 == A) {
23483 ShuffleMask.push_back(i + NumElts);
23490 // Only perform this blend if we can perform it without loading a mask.
23491 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
23492 isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
23493 (VT != MVT::v16i16 ||
23494 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
23495 (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
23496 Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
// Shift by each amount splatted across the whole vector, then blend.
23498 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23499 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23501 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23502 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23503 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
23507 // If possible, lower this packed shift into a vector multiply instead of
23508 // expanding it into a sequence of scalar shifts.
23509 if (Op.getOpcode() == ISD::SHL)
23510 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
23511 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
23513 // Constant ISD::SRL can be performed efficiently on vXi8/vXi16 vectors as we
23514 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
23515 // TODO: Improve support for the shift by zero special case.
23516 if (Op.getOpcode() == ISD::SRL && ConstantAmt &&
23517 ((Subtarget.hasSSE41() && VT == MVT::v8i16) ||
23518 DAG.isKnownNeverZero(Amt)) &&
23519 (VT == MVT::v16i8 || VT == MVT::v8i16 ||
23520 ((VT == MVT::v32i8 || VT == MVT::v16i16) && Subtarget.hasInt256()))) {
23521 SDValue EltBits = DAG.getConstant(VT.getScalarSizeInBits(), dl, VT);
23522 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
23523 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
// MULHU by (1 << (bits - amt)) equals SRL by amt -- except for amt == 0,
// which the select below patches up by passing R through unchanged.
23524 SDValue Zero = DAG.getConstant(0, dl, VT);
23525 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
23526 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
23527 return DAG.getSelect(dl, VT, ZAmt, R, Res);
23531 // v4i32 Non Uniform Shifts.
23532 // If the shift amount is constant we can shift each lane using the SSE2
23533 // immediate shifts, else we need to zero-extend each lane to the lower i64
23534 // and shift using the SSE2 variable shifts.
23535 // The separate results can then be blended together.
23536 if (VT == MVT::v4i32) {
23537 unsigned Opc = Op.getOpcode();
23538 SDValue Amt0, Amt1, Amt2, Amt3;
// Constant path (guard elided in this dump): splat each lane's amount.
23540 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23541 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23542 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23543 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23545 // ISD::SHL is handled above but we include it here for completeness.
// Switch on the generic opcode (switch header elided in this dump).
23548 llvm_unreachable("Unknown target vector shift node");
23550 Opc = X86ISD::VSHL;
23553 Opc = X86ISD::VSRL;
23556 Opc = X86ISD::VSRA;
23559 // The SSE2 shifts use the lower i64 as the same shift amount for
23560 // all lanes and the upper i64 is ignored. On AVX we're better off
23561 // just zero-extending, but for SSE just duplicating the top 16-bits is
23562 // cheaper and has the same effect for out of range values.
23563 if (Subtarget.hasAVX()) {
23564 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23565 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23566 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23567 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23568 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23570 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
23571 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23572 {4, 5, 6, 7, -1, -1, -1, -1});
23573 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23574 {0, 1, 1, 1, -1, -1, -1, -1});
23575 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23576 {2, 3, 3, 3, -1, -1, -1, -1});
23577 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23578 {0, 1, 1, 1, -1, -1, -1, -1});
23579 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23580 {2, 3, 3, 3, -1, -1, -1, -1});
// Perform the four per-lane shifts.
23584 SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
23585 SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
23586 SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
23587 SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
23589 // Merge the shifted lane results optimally with/without PBLENDW.
23590 // TODO - ideally shuffle combining would handle this.
23591 if (Subtarget.hasSSE41()) {
23592 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23593 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23594 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23596 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
23597 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
23598 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
23601 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23602 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23603 // make the existing SSE solution better.
23604 // NOTE: We honor prefered vector width before promoting to 512-bits.
23605 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23606 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23607 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23608 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23609 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23610 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23611 "Unexpected vector type");
23612 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23613 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
// SRA needs sign-extension of the value; amounts are always zero-extended.
23615 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23616 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23617 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23618 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23619 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
// vXi8: bit-serial ladder -- shift by 4/2/1 and select each stage by a
// bit of the amount, moved to the sign bit for VSELECT/PBLENDVB.
23622 if (VT == MVT::v16i8 ||
23623 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23624 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23625 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23626 unsigned ShiftOpcode = Op->getOpcode();
// Select V0/V1 per byte based on Sel's per-byte sign bit, using the
// cheapest construct the subtarget offers.
23628 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23629 if (VT.is512BitVector()) {
23630 // On AVX512BW targets we make use of the fact that VSELECT lowers
23631 // to a masked blend which selects bytes based just on the sign bit
23632 // extracted to a mask.
23633 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23634 V0 = DAG.getBitcast(VT, V0);
23635 V1 = DAG.getBitcast(VT, V1);
23636 Sel = DAG.getBitcast(VT, Sel);
23637 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
23639 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23640 } else if (Subtarget.hasSSE41()) {
23641 // On SSE41 targets we make use of the fact that VSELECT lowers
23642 // to PBLENDVB which selects bytes based just on the sign bit.
23643 V0 = DAG.getBitcast(VT, V0);
23644 V1 = DAG.getBitcast(VT, V1);
23645 Sel = DAG.getBitcast(VT, Sel);
23646 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23648 // On pre-SSE41 targets we test for the sign bit by comparing to
23649 // zero - a negative value will set all bits of the lanes to true
23650 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23651 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23652 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23653 return DAG.getSelect(dl, SelVT, C, V0, V1);
23656 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23657 // We can safely do this using i16 shifts as we're only interested in
23658 // the 3 lower bits of each byte.
23659 Amt = DAG.getBitcast(ExtVT, Amt);
23660 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23661 Amt = DAG.getBitcast(VT, Amt);
23663 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23664 // r = VSELECT(r, shift(r, 4), a);
23666 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23667 R = SignBitSelect(VT, Amt, M, R);
// Advance to the next amount bit: a += a (i.e. a <<= 1).
23670 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23672 // r = VSELECT(r, shift(r, 2), a);
23673 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23674 R = SignBitSelect(VT, Amt, M, R);
23677 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23679 // return VSELECT(r, shift(r, 1), a);
23680 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23681 R = SignBitSelect(VT, Amt, M, R);
23685 if (Op->getOpcode() == ISD::SRA) {
23686 // For SRA we need to unpack each byte to the higher byte of a i16 vector
23687 // so we can correctly sign extend. We don't care what happens to the
// Unpack amounts and values lo/hi; the source byte lands in the high
// half of each i16 so arithmetic shifts sign-extend correctly.
23689 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23690 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23691 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23692 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23693 ALo = DAG.getBitcast(ExtVT, ALo);
23694 AHi = DAG.getBitcast(ExtVT, AHi);
23695 RLo = DAG.getBitcast(ExtVT, RLo);
23696 RHi = DAG.getBitcast(ExtVT, RHi);
23698 // r = VSELECT(r, shift(r, 4), a);
23699 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23700 DAG.getConstant(4, dl, ExtVT));
23701 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23702 DAG.getConstant(4, dl, ExtVT));
23703 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23704 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
// Move to the next amount bit in both halves.
23707 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23708 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23710 // r = VSELECT(r, shift(r, 2), a);
23711 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23712 DAG.getConstant(2, dl, ExtVT));
23713 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23714 DAG.getConstant(2, dl, ExtVT));
23715 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23716 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23719 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23720 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23722 // r = VSELECT(r, shift(r, 1), a);
23723 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23724 DAG.getConstant(1, dl, ExtVT));
23725 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23726 DAG.getConstant(1, dl, ExtVT));
23727 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23728 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23730 // Logical shift the result back to the lower byte, leaving a zero upper
23732 // meaning that we can safely pack with PACKUSWB.
23734 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23736 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23737 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
// v16i16 on AVX2 without XOP: unpack to v8i32 halves (value in the high
// half so it shifts correctly), shift, then shift back down and repack.
23741 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23742 MVT ExtVT = MVT::v8i32;
23743 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23744 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23745 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23746 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23747 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23748 ALo = DAG.getBitcast(ExtVT, ALo);
23749 AHi = DAG.getBitcast(ExtVT, AHi);
23750 RLo = DAG.getBitcast(ExtVT, RLo);
23751 RHi = DAG.getBitcast(ExtVT, RHi);
23752 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23753 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23754 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23755 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23756 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
// v8i16 fallback: bit-serial ladder over amount bits 3..0 (shift by
// 8/4/2/1), selecting with the amount bit moved to the sign position.
23759 if (VT == MVT::v8i16) {
23760 unsigned ShiftOpcode = Op->getOpcode();
23762 // If we have a constant shift amount, the non-SSE41 path is best as
23763 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
23764 bool UseSSE41 = Subtarget.hasSSE41() &&
23765 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23767 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23768 // On SSE41 targets we make use of the fact that VSELECT lowers
23769 // to PBLENDVB which selects bytes based just on the sign bit.
23771 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23772 V0 = DAG.getBitcast(ExtVT, V0);
23773 V1 = DAG.getBitcast(ExtVT, V1);
23774 Sel = DAG.getBitcast(ExtVT, Sel);
23775 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23777 // On pre-SSE41 targets we splat the sign bit - a negative value will
23778 // set all bits of the lanes to true and VSELECT uses that in
23779 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23781 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23782 return DAG.getSelect(dl, VT, C, V0, V1);
23785 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23787 // On SSE41 targets we need to replicate the shift mask in both
23788 // bytes for PBLENDVB.
23791 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23792 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23794 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23797 // r = VSELECT(r, shift(r, 8), a);
23798 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23799 R = SignBitSelect(Amt, M, R);
// Advance to the next amount bit.
23802 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23804 // r = VSELECT(r, shift(r, 4), a);
23805 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23806 R = SignBitSelect(Amt, M, R);
23809 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23811 // r = VSELECT(r, shift(r, 2), a);
23812 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23813 R = SignBitSelect(Amt, M, R);
23816 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23818 // return VSELECT(r, shift(r, 1), a);
23819 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23820 R = SignBitSelect(Amt, M, R);
23824 // Decompose 256-bit shifts into smaller 128-bit shifts.
23825 if (VT.is256BitVector())
23826 return Lower256IntArith(Op, DAG);
// Custom-lower vector rotates (ROTL, and ROTL/ROTR by immediate on AVX512):
// prefer immediate VPROL/VPROR on AVX512, XOP VPROT* when available, and
// otherwise expand to SHL|SRL pairs, a vXi8 select ladder, or multiply
// tricks (MUL/MULHU for i16, PMULUDQ for v4i32).
// NOTE(review): elided dump -- original line numbers jump, so some guards,
// early returns and closing braces are missing from view. Comments describe
// only the visible code.
23831 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23832 SelectionDAG &DAG) {
23833 MVT VT = Op.getSimpleValueType();
23834 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23837 SDValue R = Op.getOperand(0);
23838 SDValue Amt = Op.getOperand(1);
23839 unsigned Opcode = Op.getOpcode();
23840 unsigned EltSizeInBits = VT.getScalarSizeInBits();
// AVX512 has native 32/64-bit rotates.
23842 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
23843 // Attempt to rotate by immediate.
23845 SmallVector<APInt, 16> EltBits;
23846 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
// All lanes defined and equal -> a single VPROLI/VPRORI immediate.
23847 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23848 return EltBits[0] == V;
23850 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
// Reduce the amount modulo the element width for the imm8 operand.
23851 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23852 return DAG.getNode(Op, DL, VT, R,
23853 DAG.getConstant(RotateAmt, DL, MVT::i8));
23857 // Else, fall-back on VPROLV/VPRORV.
// Everything below only handles ROTL (ROTR is AVX512-only above).
23861 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23863 // XOP has 128-bit vector variable + immediate rotates.
23864 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23865 if (Subtarget.hasXOP()) {
23866 // Split 256-bit integers.
23867 if (VT.is256BitVector())
23868 return Lower256IntArith(Op, DAG);
23869 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23871 // Attempt to rotate by immediate.
23872 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23873 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23874 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23875 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23876 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23877 DAG.getConstant(RotateAmt, DL, MVT::i8));
23881 // Use general rotate by variable (per-element).
23885 // Split 256-bit integers on pre-AVX2 targets.
23886 if (VT.is256BitVector() && !Subtarget.hasAVX2())
23887 return Lower256IntArith(Op, DAG);
23889 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
23890 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
23891 Subtarget.hasAVX2())) &&
23892 "Only vXi32/vXi16/vXi8 vector rotates supported");
23894 // Rotate by an uniform constant - expand back to shifts.
23895 // TODO - legalizers should be able to handle this.
23896 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23897 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23898 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23899 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23900 if (RotateAmt == 0)
// rotl(R, c) == (R << c) | (R >> (bits - c)).
23903 SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
23904 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23905 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23906 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23910 // Rotate by splat - expand back to shifts.
23911 // TODO - legalizers should be able to handle this.
23912 if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
23913 IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
23914 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23915 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23916 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23917 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23918 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23921 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
23923 if (EltSizeInBits == 8) {
// With BWI the variable byte shifts are cheap enough to just use the
// generic SHL|SRL expansion.
23924 if (Subtarget.hasBWI()) {
23925 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23926 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23927 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23928 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23929 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23932 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
// Per-byte select on Sel's sign bit (PBLENDVB on SSE41, PCMPGT+select
// otherwise) -- same idiom as in LowerShift.
23934 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23935 if (Subtarget.hasSSE41()) {
23936 // On SSE41 targets we make use of the fact that VSELECT lowers
23937 // to PBLENDVB which selects bytes based just on the sign bit.
23938 V0 = DAG.getBitcast(VT, V0);
23939 V1 = DAG.getBitcast(VT, V1);
23940 Sel = DAG.getBitcast(VT, Sel);
23941 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
23943 // On pre-SSE41 targets we test for the sign bit by comparing to
23944 // zero - a negative value will set all bits of the lanes to true
23945 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23946 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
23947 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
23948 return DAG.getSelect(DL, SelVT, C, V0, V1);
23951 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23952 // We can safely do this using i16 shifts as we're only interested in
23953 // the 3 lower bits of each byte.
23954 Amt = DAG.getBitcast(ExtVT, Amt);
23955 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
23956 Amt = DAG.getBitcast(VT, Amt);
23958 // r = VSELECT(r, rot(r, 4), a);
// rot(r, c) is expressed as (r << c) | (r >> (8 - c)).
23962 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
23963 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
23964 R = SignBitSelect(VT, Amt, M, R);
// Advance to the next amount bit.
23967 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23969 // r = VSELECT(r, rot(r, 2), a);
23972 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
23973 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
23974 R = SignBitSelect(VT, Amt, M, R);
23977 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23979 // return VSELECT(r, rot(r, 1), a);
23982 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
23983 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
23984 return SignBitSelect(VT, Amt, M, R);
23987 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23988 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
23989 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
23991 // Best to fallback for all supported variable shifts.
23992 // AVX2 - best to fallback for non-constants as well.
23993 // TODO - legalizers should be able to handle this.
23994 if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
23995 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23996 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23997 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23998 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23999 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
24002 // As with shifts, convert the rotation amount to a multiplication factor.
24003 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
24004 assert(Scale && "Failed to convert ROTL amount to scale");
24006 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
// MUL gives the rotated-in low bits, MULHU the wrapped-around high bits.
24007 if (EltSizeInBits == 16) {
24008 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
24009 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
24010 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24013 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
24014 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
24015 // that can then be OR'd with the lower 32-bits.
24016 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
// Move odd lanes into even positions so PMULUDQ (which multiplies the
// even 32-bit lanes) can produce all four 64-bit products in two ops.
24017 static const int OddMask[] = {1, -1, 3, -1};
24018 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
24019 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
24021 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
24022 DAG.getBitcast(MVT::v2i64, R),
24023 DAG.getBitcast(MVT::v2i64, Scale));
24024 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
24025 DAG.getBitcast(MVT::v2i64, R13),
24026 DAG.getBitcast(MVT::v2i64, Scale13));
24027 Res02 = DAG.getBitcast(VT, Res02);
24028 Res13 = DAG.getBitcast(VT, Res13);
// Low halves of the products = shifted-left bits; high halves = wrapped
// bits; interleave and OR them back into a v4i32 rotate result.
24030 return DAG.getNode(ISD::OR, DL, VT,
24031 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
24032 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
// Lower ISD::{S,U}{ADD,SUB,MUL}O nodes to X86 arithmetic ops that set EFLAGS,
// paired with a SETCC on the appropriate condition (COND_O for signed
// overflow, COND_B for unsigned carry).
// NOTE(review): this listing elides lines (the SADDO/UADDO/SSUBO/USUBO/SMULO
// case labels and break statements are not visible); code left untouched.
24035 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24036 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
24037 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24038 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24039 // has only one use.
24040 SDNode *N = Op.getNode();
24041 SDValue LHS = N->getOperand(0);
24042 SDValue RHS = N->getOperand(1);
24043 unsigned BaseOp = 0;
24044 X86::CondCode Cond;
24046 switch (Op.getOpcode()) {
24047 default: llvm_unreachable("Unknown ovf instruction!");
24049 // A subtract of one will be selected as a INC. Note that INC doesn't
24050 // set CF, so we can't do this for UADDO.
24051 if (isOneConstant(RHS)) {
24052 BaseOp = X86ISD::INC;
24053 Cond = X86::COND_O;
// Generic signed add: check the overflow flag.
24056 BaseOp = X86ISD::ADD;
24057 Cond = X86::COND_O;
// Unsigned add: carry flag signals overflow.
24060 BaseOp = X86ISD::ADD;
24061 Cond = X86::COND_B;
24064 // A subtract of one will be selected as a DEC. Note that DEC doesn't
24065 // set CF, so we can't do this for USUBO.
24066 if (isOneConstant(RHS)) {
24067 BaseOp = X86ISD::DEC;
24068 Cond = X86::COND_O;
// Generic signed sub: overflow flag.
24071 BaseOp = X86ISD::SUB;
24072 Cond = X86::COND_O;
// Unsigned sub: carry flag.
24075 BaseOp = X86ISD::SUB;
24076 Cond = X86::COND_B;
// Signed multiply: i8 uses the dedicated SMUL8 node (AL-based imul).
24079 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
24080 Cond = X86::COND_O;
24082 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
24083 if (N->getValueType(0) == MVT::i8) {
24084 BaseOp = X86ISD::UMUL8;
24085 Cond = X86::COND_O;
// UMUL produces (lo, hi, flags); the SETCC reads result #2 (the flags).
24088 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
24090 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
24092 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
24094 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24098 // Also sets EFLAGS.
24099 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
24100 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
// Common path: result #1 of the arithmetic node is the EFLAGS value.
24102 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
24104 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24107 /// Returns true if the operand type is exactly twice the native width, and
24108 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
24109 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
24110 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
24111 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
24112 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
// NOTE(review): the `if (OpWidth == 64)` guard is elided from this listing;
// on 32-bit targets a 64-bit access needs cmpxchg8b.
24115 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
24116 else if (OpWidth == 128)
// 128-bit accesses need cmpxchg16b, which is a separate CPUID feature.
24117 return Subtarget.hasCmpxchg16b();
// Atomic stores wider than the native width must be expanded (they end up as
// a cmpxchg8b/16b loop); delegate the width/feature test to needsCmpXchgNb.
24122 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
24123 return needsCmpXchgNb(SI->getValueOperand()->getType());
24126 // Note: this turns large loads into lock cmpxchg8b/16b.
24127 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
24128 TargetLowering::AtomicExpansionKind
24129 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// Expand to a cmpxchg loop only when the loaded type exceeds the native
// width but a cmpxchg8b/16b is available; otherwise leave the load alone.
24130 auto PTy = cast<PointerType>(LI->getPointerOperandType());
24131 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
24132 : AtomicExpansionKind::None;
// Decide how AtomicExpand should handle an atomicrmw: keep it (native xadd/
// xchg or lock-prefixed op) or expand it into a cmpxchg loop.
24135 TargetLowering::AtomicExpansionKind
24136 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
24137 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24138 Type *MemType = AI->getType();
24140 // If the operand is too big, we must see if cmpxchg8/16b is available
24141 // and default to library calls otherwise.
24142 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
24143 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
24144 : AtomicExpansionKind::None;
24147 AtomicRMWInst::BinOp Op = AI->getOperation();
// NOTE(review): the `switch (Op)` header line is elided from this listing.
24150 llvm_unreachable("Unknown atomic operation");
24151 case AtomicRMWInst::Xchg:
24152 case AtomicRMWInst::Add:
24153 case AtomicRMWInst::Sub:
24154 // It's better to use xadd, xsub or xchg for these in all cases.
24155 return AtomicExpansionKind::None;
24156 case AtomicRMWInst::Or:
24157 case AtomicRMWInst::And:
24158 case AtomicRMWInst::Xor:
24159 // If the atomicrmw's result isn't actually used, we can just add a "lock"
24160 // prefix to a normal instruction for these operations.
24161 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
24162 : AtomicExpansionKind::None;
24163 case AtomicRMWInst::Nand:
24164 case AtomicRMWInst::Max:
24165 case AtomicRMWInst::Min:
24166 case AtomicRMWInst::UMax:
24167 case AtomicRMWInst::UMin:
24168 // These always require a non-trivial set of data operations on x86. We must
24169 // use a cmpxchg loop.
24170 return AtomicExpansionKind::CmpXChg;
// Turn an idempotent atomicrmw (one that leaves memory unchanged, e.g.
// fetch_add 0) into mfence + atomic load, which is cheaper than a locked op.
// NOTE(review): the return-type line and several statements (early returns,
// the MFence declaration line) are elided from this listing.
24175 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
24176 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24177 Type *MemType = AI->getType();
24178 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
24179 // there is no benefit in turning such RMWs into loads, and it is actually
24180 // harmful as it introduces a mfence.
24181 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
24184 auto Builder = IRBuilder<>(AI);
24185 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
24186 auto SSID = AI->getSyncScopeID();
24187 // We must restrict the ordering to avoid generating loads with Release or
24188 // ReleaseAcquire orderings.
24189 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
24190 auto Ptr = AI->getPointerOperand();
24192 // Before the load we need a fence. Here is an example lifted from
24193 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
24196 // x.store(1, relaxed);
24197 // r1 = y.fetch_add(0, release);
24199 // y.fetch_add(42, acquire);
24200 // r2 = x.load(relaxed);
24201 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
24202 // lowered to just a load without a fence. A mfence flushes the store buffer,
24203 // making the optimization clearly correct.
24204 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
24205 // otherwise, we might be able to be more aggressive on relaxed idempotent
24206 // rmw. In practice, they do not look useful, so we don't try to be
24207 // especially clever.
24208 if (SSID == SyncScope::SingleThread)
24209 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
24210 // the IR level, so we must wrap it in an intrinsic.
24213 if (!Subtarget.hasMFence())
24214 // FIXME: it might make sense to use a locked operation here but on a
24215 // different cache-line to prevent cache-line bouncing. In practice it
24216 // is probably a small win, and x86 processors without mfence are rare
24217 // enough that we do not bother.
// Emit the SSE2 mfence intrinsic before the load (see HP report above).
24221 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
24222 Builder.CreateCall(MFence, {});
24224 // Finally we can emit the atomic load.
24225 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
24226 AI->getType()->getPrimitiveSizeInBits());
24227 Loaded->setAtomic(Order, SSID);
// Replace and delete the original rmw; its value was the loaded value.
24228 AI->replaceAllUsesWith(Loaded);
24229 AI->eraseFromParent();
// Lower ISD::ATOMIC_FENCE. Only a seq_cst cross-thread fence needs a real
// instruction: MFENCE when available, otherwise a `lock or` to the stack
// slot at (ESP), which has the same store-buffer-draining effect.
24233 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
24234 SelectionDAG &DAG) {
// Operands 1 and 2 carry the ordering and sync-scope as constants.
24236 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
24237 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
24238 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
24239 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
24241 // The only fence that needs an instruction is a sequentially-consistent
24242 // cross-thread fence.
24243 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
24244 FenceSSID == SyncScope::System) {
24245 if (Subtarget.hasMFence())
24246 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
// No MFENCE: use a locked OR of zero into [ESP] as a full barrier.
24248 SDValue Chain = Op.getOperand(0);
24249 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
24251 DAG.getRegister(X86::ESP, MVT::i32), // Base
24252 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
24253 DAG.getRegister(0, MVT::i32), // Index
24254 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
24255 DAG.getRegister(0, MVT::i32), // Segment.
24259 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
24260 return SDValue(Res, 0);
24263 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
24264 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
// Lower a cmpxchg node to LCMPXCHG_DAG: the expected value is pinned to the
// accumulator register (AL/AX/EAX/RAX per width), and success is a SETE on
// the resulting EFLAGS.
24267 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
24268 SelectionDAG &DAG) {
24269 MVT T = Op.getSimpleValueType();
// Pick the accumulator sub-register and operand size for the value type.
24273 switch(T.SimpleTy) {
24274 default: llvm_unreachable("Invalid value type!");
24275 case MVT::i8: Reg = X86::AL; size = 1; break;
24276 case MVT::i16: Reg = X86::AX; size = 2; break;
24277 case MVT::i32: Reg = X86::EAX; size = 4; break;
24279 assert(Subtarget.is64Bit() && "Node not type legal!");
24280 Reg = X86::RAX; size = 8;
// Copy the comparison value into the accumulator, glued into the chain.
24283 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
24284 Op.getOperand(2), SDValue());
24285 SDValue Ops[] = { cpIn.getValue(0),
24288 DAG.getTargetConstant(size, DL, MVT::i8),
24289 cpIn.getValue(1) };
24290 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24291 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
24292 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
// Read the old value back out of the accumulator, then EFLAGS.
24296 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
24297 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
24298 MVT::i32, cpOut.getValue(2));
// ZF set => the exchange happened.
24299 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
24301 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
24302 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
24303 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
24307 // Create MOVMSKB, taking into account whether we need to split for AVX1.
// For v32i8 without AVX2 (integer 256-bit ops unavailable), split into two
// 128-bit MOVMSKs and recombine: low half in bits 0-15, high half shifted
// into bits 16-31.
24308 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
24309 const X86Subtarget &Subtarget) {
24310 MVT InVT = V.getSimpleValueType();
24312 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
24314 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
24315 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
24316 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
24317 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
24318 DAG.getConstant(16, DL, MVT::i8));
24319 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
// Single-instruction case: one MOVMSK covers the whole vector.
24322 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
// Custom-lower the bitcasts x86 can't handle generically: i64<->v64i1 on
// 32-bit BWI, splitting of v32i16/v64i8 without BWI, mask-vector to scalar
// via MOVMSK, small-vector/i64 to f64 via a widened build_vector, and the
// MMX <-> i64 legal cases.
24325 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
24326 SelectionDAG &DAG) {
24327 SDValue Src = Op.getOperand(0);
24328 MVT SrcVT = Src.getSimpleValueType();
24329 MVT DstVT = Op.getSimpleValueType();
24331 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
24332 // half to v32i1 and concatenating the result.
24333 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
24334 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
24335 assert(Subtarget.hasBWI() && "Expected BWI target");
24337 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24338 DAG.getIntPtrConstant(0, dl));
24339 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24340 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24341 DAG.getIntPtrConstant(1, dl));
24342 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24343 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24346 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
24347 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
24348 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
24351 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
24352 EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
24353 DstVT.getVectorNumElements() / 2);
24354 Lo = DAG.getBitcast(CastVT, Lo);
24355 Hi = DAG.getBitcast(CastVT, Hi);
24356 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
24359 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
24360 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
24361 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
24362 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
// Sign-extend each i1 to a full byte so MOVMSK can pick up the sign bits.
24364 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
24365 V = getPMOVMSKB(DL, V, DAG, Subtarget);
24366 return DAG.getZExtOrTrunc(V, DL, DstVT);
24369 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
24370 SrcVT == MVT::i64) {
24371 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24372 if (DstVT != MVT::f64)
24373 // This conversion needs to be expanded.
// Collect the source's scalar elements so we can rebuild it as a
// double-width build_vector and extract the f64 from lane 0.
24376 SmallVector<SDValue, 16> Elts;
24380 if (SrcVT.isVector()) {
24381 NumElts = SrcVT.getVectorNumElements();
24382 SVT = SrcVT.getVectorElementType();
24384 // Widen the vector in input in the case of MVT::v2i32.
24385 // Example: from MVT::v2i32 to MVT::v4i32.
24386 for (unsigned i = 0, e = NumElts; i != e; ++i)
24387 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
24388 DAG.getIntPtrConstant(i, dl)));
24390 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
24391 "Unexpected source type in LowerBITCAST");
24392 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24393 DAG.getIntPtrConstant(0, dl)));
24394 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24395 DAG.getIntPtrConstant(1, dl)));
24399 // Explicitly mark the extra elements as Undef.
24400 Elts.append(NumElts, DAG.getUNDEF(SVT));
24402 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24403 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
24404 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
24405 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
24406 DAG.getIntPtrConstant(0, dl));
// Remaining cases are the 64-bit MMX conversions on x86-64 without SSE2.
24409 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
24410 Subtarget.hasMMX() && "Unexpected custom BITCAST");
24411 assert((DstVT == MVT::i64 ||
24412 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
24413 "Unexpected custom BITCAST");
24414 // i64 <=> MMX conversions are Legal.
24415 if (SrcVT==MVT::i64 && DstVT.isVector())
24417 if (DstVT==MVT::i64 && SrcVT.isVector())
24419 // MMX <=> MMX conversions are Legal.
24420 if (SrcVT.isVector() && DstVT.isVector())
24422 // All other conversions need to be expanded.
24426 /// Compute the horizontal sum of bytes in V for the elements of VT.
24428 /// Requires V to be a byte vector and VT to be an integer vector type with
24429 /// wider elements than V's type. The width of the elements of VT determines
24430 /// how many bytes of V are summed horizontally to produce each element of the
// result. Used to finish vector CTPOP lowering: V holds per-byte popcounts.
24432 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
24433 const X86Subtarget &Subtarget,
24434 SelectionDAG &DAG) {
24436 MVT ByteVecVT = V.getSimpleValueType();
24437 MVT EltVT = VT.getVectorElementType();
24438 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
24439 "Expected value to have byte element type.");
24440 assert(EltVT != MVT::i8 &&
24441 "Horizontal byte sum only makes sense for wider elements!");
24442 unsigned VecSize = VT.getSizeInBits();
24443 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
24445 // PSADBW instruction horizontally add all bytes and leave the result in i64
24446 // chunks, thus directly computes the pop count for v2i64 and v4i64.
24447 if (EltVT == MVT::i64) {
24448 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24449 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24450 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
24451 return DAG.getBitcast(VT, V);
24454 if (EltVT == MVT::i32) {
24455 // We unpack the low half and high half into i32s interleaved with zeros so
24456 // that we can use PSADBW to horizontally sum them. The most useful part of
24457 // this is that it lines up the results of two PSADBW instructions to be
24458 // two v2i64 vectors which concatenated are the 4 population counts. We can
24459 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
24460 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
24461 SDValue V32 = DAG.getBitcast(VT, V);
24462 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
24463 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
24465 // Do the horizontal sums into two v2i64s.
24466 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24467 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24468 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24469 DAG.getBitcast(ByteVecVT, Low), Zeros);
24470 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24471 DAG.getBitcast(ByteVecVT, High), Zeros);
24473 // Merge them together.
24474 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
24475 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
24476 DAG.getBitcast(ShortVecVT, Low),
24477 DAG.getBitcast(ShortVecVT, High));
24479 return DAG.getBitcast(VT, V);
24482 // The only element type left is i16.
24483 assert(EltVT == MVT::i16 && "Unknown how to handle type");
24485 // To obtain pop count for each i16 element starting from the pop count for
24486 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
24487 // right by 8. It is important to shift as i16s as i8 vector shift isn't
24488 // directly supported.
24489 SDValue ShifterV = DAG.getConstant(8, DL, VT);
24490 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24491 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
24492 DAG.getBitcast(ByteVecVT, V))
24493 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
// Lower vector CTPOP with a PSHUFB-indexed in-register nibble lookup table
// (requires SSSE3); wider-than-i8 results are finished by
// LowerHorizontalByteSum.
24496 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
24497 const X86Subtarget &Subtarget,
24498 SelectionDAG &DAG) {
24499 MVT VT = Op.getSimpleValueType();
24500 MVT EltVT = VT.getVectorElementType();
24501 unsigned VecSize = VT.getSizeInBits();
24503 // Implement a lookup table in register by using an algorithm based on:
24504 // http://wm.ite.pl/articles/sse-popcount.html
24506 // The general idea is that every lower byte nibble in the input vector is an
24507 // index into a in-register pre-computed pop count table. We then split up the
24508 // input vector in two new ones: (1) a vector with only the shifted-right
24509 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
24510 // masked out higher ones) for each byte. PSHUFB is used separately with both
24511 // to index the in-register table. Next, both are added and the result is a
24512 // i8 vector where each element contains the pop count for input byte.
24514 // To obtain the pop count for elements != i8, we follow up with the same
24515 // approach and use additional tricks as described below.
// LUT[n] = number of set bits in the 4-bit value n.
24517 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
24518 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
24519 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
24520 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
24522 int NumByteElts = VecSize / 8;
24523 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
24524 SDValue In = DAG.getBitcast(ByteVecVT, Op);
24525 SmallVector<SDValue, 64> LUTVec;
// Replicate the 16-entry table across every 16-byte lane.
24526 for (int i = 0; i < NumByteElts; ++i)
24527 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24528 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
24529 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// Split each byte into its high and low nibble.
24532 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
24533 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
24536 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
24538 // The input vector is used as the shuffle mask that index elements into the
24539 // LUT. After counting low and high nibbles, add the vector to obtain the
24540 // final pop count per i8 element.
24541 SDValue HighPopCnt =
24542 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24543 SDValue LowPopCnt =
24544 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24545 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
24547 if (EltVT == MVT::i8)
// Wider elements: horizontally sum the per-byte counts.
24550 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
// Lower 128-bit vector CTPOP with pure shift/mask/add bit tricks — the
// fallback when SSSE3 (and thus the PSHUFB LUT approach) is unavailable.
24553 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
24554 const X86Subtarget &Subtarget,
24555 SelectionDAG &DAG) {
24556 MVT VT = Op.getSimpleValueType();
24557 assert(VT.is128BitVector() &&
24558 "Only 128-bit vector bitmath lowering supported.");
24560 int VecSize = VT.getSizeInBits();
24561 MVT EltVT = VT.getVectorElementType();
24562 int Len = EltVT.getSizeInBits();
24564 // This is the vectorized version of the "best" algorithm from
24565 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
24566 // with a minor tweak to use a series of adds + shifts instead of vector
24567 // multiplications. Implemented for all integer vector types. We only use
24568 // this when we don't have SSSE3 which allows a LUT-based lowering that is
24569 // much faster, even faster than using native popcnt instructions.
// Helper: elementwise shift of V by the immediate Shifter.
24571 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
24572 MVT VT = V.getSimpleValueType();
24573 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
24574 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
// Helper: elementwise AND of V with the splatted constant Mask.
24576 auto GetMask = [&](SDValue V, APInt Mask) {
24577 MVT VT = V.getSimpleValueType();
24578 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
24579 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
24582 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
24583 // x86, so set the SRL type to have elements at least i16 wide. This is
24584 // correct because all of our SRLs are followed immediately by a mask anyways
24585 // that handles any bits that sneak into the high bits of the byte elements.
24586 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
24590 // v = v - ((v >> 1) & 0x55555555...)
24592 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
24593 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
24594 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
24596 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
24597 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
24598 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
24599 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
24600 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
24602 // v = (v + (v >> 4)) & 0x0F0F0F0F...
24603 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
24604 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
24605 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
24607 // At this point, V contains the byte-wise population count, and we are
24608 // merely doing a horizontal sum if necessary to get the wider element
// counts.
24610 if (EltVT == MVT::i8)
24613 return LowerHorizontalByteSum(
24614 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
24618 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
24619 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
// Dispatcher for vector CTPOP lowering: VPOPCNTDQ via widen-to-i32, SSSE3
// LUT, pre-SSSE3 bitmath, with 256/512-bit decomposition as needed.
24620 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24621 SelectionDAG &DAG) {
24622 MVT VT = Op.getSimpleValueType();
24623 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24624 "Unknown CTPOP type to handle");
24625 SDLoc DL(Op.getNode());
24626 SDValue Op0 = Op.getOperand(0);
24628 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24629 if (Subtarget.hasVPOPCNTDQ()) {
24630 unsigned NumElems = VT.getVectorNumElements();
24631 assert((VT.getVectorElementType() == MVT::i8 ||
24632 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24633 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
24634 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24635 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24636 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24637 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24641 if (!Subtarget.hasSSSE3()) {
24642 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24643 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24644 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24647 // Decompose 256-bit ops into smaller 128-bit ops.
24648 if (VT.is256BitVector() && !Subtarget.hasInt256())
24649 return Lower256IntUnary(Op, DAG);
24651 // Decompose 512-bit ops into smaller 256-bit ops.
24652 if (VT.is512BitVector() && !Subtarget.hasBWI())
24653 return Lower512IntUnary(Op, DAG);
24655 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
// Thin entry point: scalar CTPOP is never custom-lowered here, so simply
// forward vector cases to LowerVectorCTPOP.
24658 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24659 SelectionDAG &DAG) {
24660 assert(Op.getSimpleValueType().isVector() &&
24661 "We only do custom lowering for vector population count.");
24662 return LowerVectorCTPOP(Op, Subtarget, DAG);
// Lower BITREVERSE using the XOP VPPERM instruction, which can reverse the
// bits of each byte as part of a byte permute; the byte-level swap is folded
// into the same shuffle mask.
24665 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24666 MVT VT = Op.getSimpleValueType();
24667 SDValue In = Op.getOperand(0);
24670 // For scalars, its still beneficial to transfer to/from the SIMD unit to
24671 // perform the BITREVERSE.
24672 if (!VT.isVector()) {
24673 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24674 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24675 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24676 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24677 DAG.getIntPtrConstant(0, DL));
24680 int NumElts = VT.getVectorNumElements();
24681 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24683 // Decompose 256-bit ops into smaller 128-bit ops.
24684 if (VT.is256BitVector())
24685 return Lower256IntUnary(Op, DAG);
24687 assert(VT.is128BitVector() &&
24688 "Only 128-bit vector bitreverse lowering supported.");
24690 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24691 // perform the BSWAP in the shuffle.
24692 // Its best to shuffle using the second operand as this will implicitly allow
24693 // memory folding for multiple vectors.
24694 SmallVector<SDValue, 16> MaskElts;
24695 for (int i = 0; i != NumElts; ++i) {
// Bytes within each element are visited high-to-low (j counts down),
// which byte-swaps the element while (2 << 5) bit-reverses each byte.
24696 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24697 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24698 int PermuteByte = SourceByte | (2 << 5);
24699 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24703 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24704 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24705 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24707 return DAG.getBitcast(VT, Res);
// Lower BITREVERSE for byte vectors: prefer XOP's VPPERM when available,
// otherwise use two PSHUFB nibble lookup tables (one for each nibble's
// bit-reversed value) and OR the halves together.
24710 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24711 SelectionDAG &DAG) {
24712 MVT VT = Op.getSimpleValueType();
24714 if (Subtarget.hasXOP() && !VT.is512BitVector())
24715 return LowerBITREVERSE_XOP(Op, DAG);
24717 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24719 SDValue In = Op.getOperand(0);
24722 unsigned NumElts = VT.getVectorNumElements();
24723 assert(VT.getScalarType() == MVT::i8 &&
24724 "Only byte vector BITREVERSE supported");
24726 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24727 if (VT.is256BitVector() && !Subtarget.hasInt256())
24728 return Lower256IntUnary(Op, DAG);
24730 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24731 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24732 // 0-15 value (moved to the other nibble).
24733 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24734 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24735 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
// LoLUT[n] = bit-reverse of nibble n, placed in the HIGH nibble of the byte;
// HiLUT[n] = bit-reverse of nibble n, placed in the LOW nibble.
24737 const int LoLUT[16] = {
24738 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24739 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24740 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24741 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24742 const int HiLUT[16] = {
24743 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24744 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24745 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24746 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
24748 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24749 for (unsigned i = 0; i < NumElts; ++i) {
24750 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24751 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24754 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24755 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24756 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24757 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24758 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
// Map an ISD::ATOMIC_LOAD_* node to the matching LOCK-prefixed X86 memory
// op (LADD/LSUB/LOR/LXOR/LAND), using LINC/LDEC for +/-1 constants when
// inc/dec are not slow or we are optimizing for size.
// NOTE(review): the `break;` lines of the switch are elided in this listing.
24761 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24762 const X86Subtarget &Subtarget,
24763 bool AllowIncDec = true) {
24764 unsigned NewOpc = 0;
24765 switch (N->getOpcode()) {
24766 case ISD::ATOMIC_LOAD_ADD:
24767 NewOpc = X86ISD::LADD;
24769 case ISD::ATOMIC_LOAD_SUB:
24770 NewOpc = X86ISD::LSUB;
24772 case ISD::ATOMIC_LOAD_OR:
24773 NewOpc = X86ISD::LOR;
24775 case ISD::ATOMIC_LOAD_XOR:
24776 NewOpc = X86ISD::LXOR;
24778 case ISD::ATOMIC_LOAD_AND:
24779 NewOpc = X86ISD::LAND;
24782 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24785 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24787 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24788 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24789 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24790 DAG.getMachineFunction().getFunction().optForSize())) {
// add +1 or sub -1 => LOCK INC.
24791 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24792 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24793 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24794 DAG.getVTList(MVT::i32, MVT::Other),
24795 {N->getOperand(0), N->getOperand(1)},
24796 /*MemVT=*/N->getSimpleValueType(0), MMO);
// sub +1 or add -1 => LOCK DEC.
24797 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24798 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24799 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24800 DAG.getVTList(MVT::i32, MVT::Other),
24801 {N->getOperand(0), N->getOperand(1)},
24802 /*MemVT=*/N->getSimpleValueType(0), MMO);
24806 return DAG.getMemIntrinsicNode(
24807 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24808 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24809 /*MemVT=*/N->getSimpleValueType(0), MMO);
24812 /// Lower atomic_load_ops into LOCK-prefixed operations.
// If the rmw result is used, only ATOMIC_LOAD_ADD can be handled here (as
// LXADD); a used SUB is rewritten to ADD of the negation so LXADD applies.
// Unused results go to lowerAtomicArithWithLOCK.
24813 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24814 const X86Subtarget &Subtarget) {
24815 SDValue Chain = N->getOperand(0);
24816 SDValue LHS = N->getOperand(1);
24817 SDValue RHS = N->getOperand(2);
24818 unsigned Opc = N->getOpcode();
24819 MVT VT = N->getSimpleValueType(0);
24822 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24823 // can only be lowered when the result is unused. They should have already
24824 // been transformed into a cmpxchg loop in AtomicExpand.
24825 if (N->hasAnyUseOfValue(0)) {
24826 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24827 // select LXADD if LOCK_SUB can't be selected.
24828 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24829 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24830 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24831 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24832 RHS, AN->getMemOperand());
24834 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24835 "Used AtomicRMW ops other than Add should have been expanded!");
24839 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24840 // RAUW the chain, but don't worry about the result, as it's unused.
24841 assert(!N->hasAnyUseOfValue(0));
24842 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
// Lower an ATOMIC_STORE node. Sequentially-consistent stores, and stores of
// types that are not legal for the target, are converted into an ATOMIC_SWAP
// (xchg, or cmpxchg8b/cmpxchg16b for wide types) and only the swap's chain is
// returned; other atomic stores fall through to the normal store pattern.
// NOTE(review): the final fallthrough return (~line 24867) and the `dl`
// declaration are elided from this listing — confirm against the full source.
24846 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24847   SDNode *Node = Op.getNode();
24849   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24851   // Convert seq_cst store -> xchg
24852   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24853   // FIXME: On 32-bit, store -> fist or movq would be more efficient
24854   //        (The only way to get a 16-byte store is cmpxchg16b)
24855   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24856   if (cast<AtomicSDNode>(Node)->getOrdering() ==
24857           AtomicOrdering::SequentiallyConsistent ||
24858       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24859     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24860                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
24861                                  Node->getOperand(0),
24862                                  Node->getOperand(1), Node->getOperand(2),
24863                                  cast<AtomicSDNode>(Node)->getMemOperand());
// The store has no data result; return the swap's chain value.
24864     return Swap.getValue(1);
24866   // Other atomic stores have a simple pattern.
// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
// The incoming carry operand is materialized into EFLAGS by adding -1 to it
// (X86ISD::ADD produces an i32 flags result; carry-in != 0 sets CF), then the
// flags feed ADC/SBB. The carry-out is recovered with a SETCC on COND_B and
// truncated to i1 when the node's second result type requires it.
// NOTE(review): the early `return Op;`-style bailout after the type-legality
// check (~line 24876) and the `DL` declaration are elided from this listing.
24870 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24871   SDNode *N = Op.getNode();
24872   MVT VT = N->getSimpleValueType(0);
24874   // Let legalize expand this if it isn't a legal type yet.
24875   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24878   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24881   // Set the carry flag.
24882   SDValue Carry = Op.getOperand(2);
24883   EVT CarryVT = Carry.getValueType();
24884   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// Carry + (-1) overflows (sets CF) exactly when Carry is nonzero.
24885   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24886                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
24888   unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24889   SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24890                             Op.getOperand(1), Carry.getValue(1));
// Carry-out: CF after the ADC/SBB.
24892   SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24893   if (N->getValueType(1) == MVT::i1)
24894     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24896   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
// Lower ISD::FSINCOS on Darwin x86-64 by calling the __sincos_stret runtime
// entry point, which computes sin and cos in one call. For f64 the result is
// the struct { double, double } returned in xmm0/xmm1 and can be returned
// directly; for f32 the result comes back packed in xmm0 and the two scalars
// are extracted from vector lanes 0 and 1.
// NOTE(review): several lines are elided in this listing (the `dl` and
// `Callee` declarations ~24927, the Entry.Node/Ty setup ~24913-24915, and the
// `if (isF64)` guarding the early return ~24940) — confirm against the source.
24899 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24900                             SelectionDAG &DAG) {
24901   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24903   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24904   // which returns the values as { float, float } (in XMM0) or
24905   // { double, double } (which is returned in XMM0, XMM1).
24907   SDValue Arg = Op.getOperand(0);
24908   EVT ArgVT = Arg.getValueType();
24909   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24911   TargetLowering::ArgListTy Args;
24912   TargetLowering::ArgListEntry Entry;
24916   Entry.IsSExt = false;
24917   Entry.IsZExt = false;
24918   Args.push_back(Entry);
24920   bool isF64 = ArgVT == MVT::f64;
24921   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24922   // the small struct {f32, f32} is returned in (eax, edx). For f64,
24923   // the results are returned via SRet in memory.
24924   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24925   RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24926   const char *LibcallName = TLI.getLibcallName(LC);
24928       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
// Result type of the call: a two-element struct for f64, a v4f32-style
// vector for f32 (sin/cos live in the first two lanes).
24930   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24931                       : (Type *)VectorType::get(ArgTy, 4);
24933   TargetLowering::CallLoweringInfo CLI(DAG);
24934   CLI.setDebugLoc(dl)
24935       .setChain(DAG.getEntryNode())
24936       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24938   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24941     // Returned in xmm0 and xmm1.
24942     return CallResult.first;
24944   // Returned in bits 0:31 and 32:64 xmm0.
24945   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24946                                CallResult.first, DAG.getIntPtrConstant(0, dl));
24947   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24948                                CallResult.first, DAG.getIntPtrConstant(1, dl));
24949   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24950   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24953 /// Widen a vector input to a vector of NVT.  The
24954 /// input vector must have the same element type as NVT.
// \param InOp          the vector to widen (may be undef or a build_vector).
// \param NVT           the target vector type; element type must match InOp's,
//                      and its element count must be a strict multiple of
//                      InOp's (asserted below).
// \param FillWithZeroes when true the new upper elements are zero; otherwise
//                      they are undef.
// Constant build_vectors are widened by rebuilding the operand list so the
// result stays a constant; everything else is widened via INSERT_SUBVECTOR
// into a zero/undef base vector.
// NOTE(review): this listing elides a few lines (the early width-equality
// return ~24959-24960, the `dl` declaration, and the second half of the
// CONCAT_VECTORS condition ~24978) — confirm against the full source.
24955 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24956                             bool FillWithZeroes = false) {
24957   // Check if InOp already has the right width.
24958   MVT InVT = InOp.getSimpleValueType();
24962   if (InOp.isUndef())
24963     return DAG.getUNDEF(NVT);
24965   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24966          "input and widen element type must match");
24968   unsigned InNumElts = InVT.getVectorNumElements();
24969   unsigned WidenNumElts = NVT.getVectorNumElements();
24970   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24971          "Unexpected request for vector widening");
// Peel off a concat with a zero/undef upper half so we re-widen from the
// original narrow value instead of stacking concats.
24974   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24975       InOp.getNumOperands() == 2) {
24976     SDValue N1 = InOp.getOperand(1);
24977     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24979       InOp = InOp.getOperand(0);
24980       InVT = InOp.getSimpleValueType();
24981       InNumElts = InVT.getVectorNumElements();
24984   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24985       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24986     SmallVector<SDValue, 16> Ops;
24987     for (unsigned i = 0; i < InNumElts; ++i)
24988       Ops.push_back(InOp.getOperand(i));
24990     EVT EltVT = InOp.getOperand(0).getValueType();
24992     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24993       DAG.getUNDEF(EltVT);
24994     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24995       Ops.push_back(FillVal);
24996     return DAG.getBuildVector(NVT, dl, Ops);
24998   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
25000   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
25001                      InOp, DAG.getIntPtrConstant(0, dl));
// Lower a masked scatter (ISD::MSCATTER) for AVX-512.
// Special cases:
//  * v2f32 / v2i32 source: if the index is v2i64 and VLX is available, widen
//    the data to 4 elements and emit an X86MaskedScatterSDNode directly using
//    xmm registers; for v2i32 without that, custom-widen all operands to
//    4 elements to avoid integer promotion.
//  * Without VLX, when neither the data nor the index is 512 bits, widen data,
//    index and mask until one of them reaches 512 bits (mask widened with
//    zeroes so the new lanes are inactive).
// In all X86MaskedScatterSDNode cases only the chain (result #1) is returned.
// NOTE(review): the `dl` declaration and a v2i32/v2i32-index early-return path
// (~25033-25036, ~25068) are elided from this listing — confirm against the
// full source.
25004 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
25005                              SelectionDAG &DAG) {
25006   assert(Subtarget.hasAVX512() &&
25007          "MGATHER/MSCATTER are supported on AVX-512 arch only");
25009   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
25010   SDValue Src = N->getValue();
25011   MVT VT = Src.getSimpleValueType();
25012   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
25015   SDValue Scale = N->getScale();
25016   SDValue Index = N->getIndex();
25017   SDValue Mask = N->getMask();
25018   SDValue Chain = N->getChain();
25019   SDValue BasePtr = N->getBasePtr();
25021   if (VT == MVT::v2f32) {
25022     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25023     // If the index is v2i64 and we have VLX we can use xmm for data and index.
25024     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25025       Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25026                         DAG.getUNDEF(MVT::v2f32));
25027       SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25028       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25029       SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25030           VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25031       DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25032       return SDValue(NewScatter.getNode(), 1);
25037   if (VT == MVT::v2i32) {
25038     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25039     Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
25040                       DAG.getUNDEF(MVT::v2i32));
25041     // If the index is v2i64 and we have VLX we can use xmm for data and index.
25042     if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25043       SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25044       SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25045       SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25046           VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25047       DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25048       return SDValue(NewScatter.getNode(), 1);
25050     // Custom widen all the operands to avoid promotion.
25051     EVT NewIndexVT = EVT::getVectorVT(
25052         *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
25053     Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25054                         DAG.getUNDEF(Index.getValueType()));
// Mask widened with zeroes so the extra lanes do not store.
25055     Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25056                        DAG.getConstant(0, dl, MVT::v2i1));
25057     SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25058     return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
25059                                 Ops, N->getMemOperand());
25062   MVT IndexVT = Index.getSimpleValueType();
25063   MVT MaskVT = Mask.getSimpleValueType();
25065   // If the index is v2i32, we're being called by type legalization and we
25066   // should just let the default handling take care of it.
25067   if (IndexVT == MVT::v2i32)
25070   // If we don't have VLX and neither the passthru or index is 512-bits, we
25071   // need to widen until one is.
25072   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
25073       !Index.getSimpleValueType().is512BitVector()) {
25074     // Determine how much we need to widen by to get a 512-bit type.
25075     unsigned Factor = std::min(512/VT.getSizeInBits(),
25076                                512/IndexVT.getSizeInBits());
25077     unsigned NumElts = VT.getVectorNumElements() * Factor;
25079     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25080     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25081     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25083     Src = ExtendToType(Src, VT, DAG);
25084     Index = ExtendToType(Index, IndexVT, DAG);
25085     Mask = ExtendToType(Mask, MaskVT, DAG, true);
25088   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
25089   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25090   SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25091       VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25092   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25093   return SDValue(NewScatter.getNode(), 1);
// Lower a masked load (ISD::MLOAD) on AVX-512 targets without VLX by widening
// the operation to a legal 512-bit vector: the pass-through value and mask are
// widened (mask padded with zeroes so extra lanes load nothing), the wide
// masked load is emitted, and the original-width result is extracted from
// element 0, merged with the load's chain.
// NOTE(review): the `dl` declaration (~25103) is elided from this listing.
25096 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
25097                           SelectionDAG &DAG) {
25099   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
25100   MVT VT = Op.getSimpleValueType();
25101   MVT ScalarVT = VT.getScalarType();
25102   SDValue Mask = N->getMask();
25105   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
25106          "Expanding masked load is supported on AVX-512 target only!");
25108   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
25109          "Expanding masked load is supported for 32 and 64-bit types only!");
25111   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25112          "Cannot lower masked load op.");
25114   assert((ScalarVT.getSizeInBits() >= 32 ||
25115           (Subtarget.hasBWI() &&
25116               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25117          "Unsupported masked load op.");
25119   // This operation is legal for targets with VLX, but without
25120   // VLX the vector should be widened to 512 bit
25121   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
25122   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25123   SDValue Src0 = N->getSrc0();
25124   Src0 = ExtendToType(Src0, WideDataVT, DAG);
25126   // Mask element has to be i1.
25127   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25128          "Unexpected mask type");
25130   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
// FillWithZeroes=true: the widened mask lanes must be inactive.
25132   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25133   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
25134                                       N->getBasePtr(), Mask, Src0,
25135                                       N->getMemoryVT(), N->getMemOperand(),
25136                                       N->getExtensionType(),
25137                                       N->isExpandingLoad());
25139   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
25140                                NewLoad.getValue(0),
25141                                DAG.getIntPtrConstant(0, dl));
25142   SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
25143   return DAG.getMergeValues(RetOps, dl);
// Lower a masked store (ISD::MSTORE) on AVX-512 targets without VLX by
// widening the stored data and the mask to a 512-bit vector (mask padded with
// zeroes so the extra lanes do not store) and emitting the wide masked store.
// NOTE(review): the two assert messages below say "masked load" but this is
// the store path — looks like a copy-paste from LowerMLOAD; fix upstream.
// NOTE(review): the `dl` declaration (~25153) is elided from this listing.
25146 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
25147                            SelectionDAG &DAG) {
25148   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
25149   SDValue DataToStore = N->getValue();
25150   MVT VT = DataToStore.getSimpleValueType();
25151   MVT ScalarVT = VT.getScalarType();
25152   SDValue Mask = N->getMask();
25155   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
25156          "Expanding masked load is supported on AVX-512 target only!");
25158   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
25159          "Expanding masked load is supported for 32 and 64-bit types only!");
25161   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25162          "Cannot lower masked store op.");
25164   assert((ScalarVT.getSizeInBits() >= 32 ||
25165           (Subtarget.hasBWI() &&
25166               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25167          "Unsupported masked store op.");
25169   // This operation is legal for targets with VLX, but without
25170   // VLX the vector should be widened to 512 bit
25171   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
25172   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25174   // Mask element has to be i1.
25175   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25176          "Unexpected mask type");
25178   MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25180   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
// FillWithZeroes=true: widened mask lanes must not store.
25181   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25182   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
25183                             Mask, N->getMemoryVT(), N->getMemOperand(),
25184                             N->isTruncatingStore(), N->isCompressingStore());
// Lower a masked gather (ISD::MGATHER) for AVX2/AVX-512. Without VLX, when
// neither the pass-through value nor the index is 512 bits, the data, index
// and mask are widened until one reaches 512 bits (mask padded with zeroes so
// the extra lanes do not load), an X86MaskedGatherSDNode is emitted, and the
// original-width result is extracted back out and merged with the chain.
// NOTE(review): several lines are elided in this listing — the `dl`
// declaration, the early return for the v2i32-index case (~25205), `OrigVT`
// (presumably the pre-widening VT, defined ~25200/25209 — confirm), and the
// Scale operand continuation at ~25228.
25187 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
25188                             SelectionDAG &DAG) {
25189   assert(Subtarget.hasAVX2() &&
25190          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
25192   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
25194   MVT VT = Op.getSimpleValueType();
25195   SDValue Index = N->getIndex();
25196   SDValue Mask = N->getMask();
25197   SDValue Src0 = N->getValue();
25198   MVT IndexVT = Index.getSimpleValueType();
25199   MVT MaskVT = Mask.getSimpleValueType();
25201   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
25203   // If the index is v2i32, we're being called by type legalization.
25204   if (IndexVT == MVT::v2i32)
25207   // If we don't have VLX and neither the passthru or index is 512-bits, we
25208   // need to widen until one is.
25210   if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25211       !IndexVT.is512BitVector()) {
25212     // Determine how much we need to widen by to get a 512-bit type.
25213     unsigned Factor = std::min(512/VT.getSizeInBits(),
25214                                512/IndexVT.getSizeInBits());
25216     unsigned NumElts = VT.getVectorNumElements() * Factor;
25218     VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25219     IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25220     MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25222     Src0 = ExtendToType(Src0, VT, DAG);
25223     Index = ExtendToType(Index, IndexVT, DAG);
// FillWithZeroes=true: widened mask lanes must be inactive.
25224     Mask = ExtendToType(Mask, MaskVT, DAG, true);
25227   SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
25229   SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25230       DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
25231       N->getMemOperand());
25232   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
25233                                 NewGather, DAG.getIntPtrConstant(0, dl));
25234   return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
// Lower GC_TRANSITION_START as a literal NOOP machine node, keeping the chain
// operand and forwarding any incoming glue so scheduling constraints survive.
// NOTE(review): the `return NOOP;` line (~25255) is elided from this listing.
25237 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
25238                                                     SelectionDAG &DAG) const {
25239   // TODO: Eventually, the lowering of these nodes should be informed by or
25240   // deferred to the GC strategy for the function in which they appear. For
25241   // now, however, they must be lowered to something. Since they are logically
25242   // no-ops in the case of a null GC strategy (or a GC strategy which does not
25243   // require special handling for these nodes), lower them as literal NOOPs for
25245   SmallVector<SDValue, 2> Ops;
25247   Ops.push_back(Op.getOperand(0));
// Preserve glue: the glued operand is always last if present.
25248   if (Op->getGluedNode())
25249     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25252   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25253   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
// Lower GC_TRANSITION_END as a literal NOOP machine node; mirrors
// LowerGC_TRANSITION_START (chain kept, incoming glue forwarded).
// NOTE(review): the `return NOOP;` line (~25276) is elided from this listing.
25258 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
25259                                                   SelectionDAG &DAG) const {
25260   // TODO: Eventually, the lowering of these nodes should be informed by or
25261   // deferred to the GC strategy for the function in which they appear. For
25262   // now, however, they must be lowered to something. Since they are logically
25263   // no-ops in the case of a null GC strategy (or a GC strategy which does not
25264   // require special handling for these nodes), lower them as literal NOOPs for
25266   SmallVector<SDValue, 2> Ops;
25268   Ops.push_back(Op.getOperand(0));
// Preserve glue: the glued operand is always last if present.
25269   if (Op->getGluedNode())
25270     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25273   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25274   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25279 /// Provide custom lowering hooks for some operations.
// Pure dispatch: each opcode marked Custom during target setup is routed to
// its dedicated Lower* helper; unhandled opcodes are a programmer error.
// NOTE(review): this listing elides some case lines (gaps in the numbering,
// e.g. additional opcodes sharing fallthrough bodies) — confirm against the
// full source before relying on the exact case set below.
25280 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
25281   switch (Op.getOpcode()) {
25282   default: llvm_unreachable("Should not custom lower this!");
25283   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
25284   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
25285     return LowerCMP_SWAP(Op, Subtarget, DAG);
25286   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
25287   case ISD::ATOMIC_LOAD_ADD:
25288   case ISD::ATOMIC_LOAD_SUB:
25289   case ISD::ATOMIC_LOAD_OR:
25290   case ISD::ATOMIC_LOAD_XOR:
25291   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
25292   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
25293   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
25294   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
25295   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
25296   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
25297   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
25298   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
25299   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
25300   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
25301   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
25302   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
25303   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
25304   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
25305   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
25306   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
25307   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
25308   case ISD::SHL_PARTS:
25309   case ISD::SRA_PARTS:
25310   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
25311   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
25312   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
25313   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
25314   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
25315   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
25316   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
25317   case ISD::ZERO_EXTEND_VECTOR_INREG:
25318   case ISD::SIGN_EXTEND_VECTOR_INREG:
25319     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
25320   case ISD::FP_TO_SINT:
25321   case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
25322   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
25323   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
25324   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
25326   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
25327   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
25328   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
25329   case ISD::SETCC:              return LowerSETCC(Op, DAG);
25330   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
25331   case ISD::SELECT:             return LowerSELECT(Op, DAG);
25332   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
25333   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
25334   case ISD::VASTART:            return LowerVASTART(Op, DAG);
25335   case ISD::VAARG:              return LowerVAARG(Op, DAG);
25336   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
25337   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
25338   case ISD::INTRINSIC_VOID:
25339   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
25340   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
25341   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
25342   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
25343   case ISD::FRAME_TO_ARGS_OFFSET:
25344                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
25345   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
25346   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
25347   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
25348   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
25349   case ISD::EH_SJLJ_SETUP_DISPATCH:
25350     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
25351   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
25352   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
25353   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
25355   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
25357   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
25358   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
25360   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
25361   case ISD::UMUL_LOHI:
25362   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
25364   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
25367   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
25373   case ISD::UMULO:              return LowerXALUO(Op, DAG);
25374   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
25375   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
25376   case ISD::ADDCARRY:
25377   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
25379   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
25383   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
25384   case ISD::ABS:                return LowerABS(Op, DAG);
25385   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
25386   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
25387   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
25388   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
25389   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
25390   case ISD::GC_TRANSITION_START:
25391                                 return LowerGC_TRANSITION_START(Op, DAG);
25392   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
25396 /// Places new result values for the node in Results (their number
25397 /// and types must exactly match those of the original return values of
25398 /// the node), or leaves Results empty, which indicates that the node is not
25399 /// to be custom lowered after all.
// Delegates to LowerOperation; if it produced a replacement node, copies the
// first getNumValues(N) results into Results. Res may legitimately carry more
// values than N (see comment below) — the surplus (e.g. a chain) is dropped.
25400 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
25401                                               SmallVectorImpl<SDValue> &Results,
25402                                               SelectionDAG &DAG) const {
25403   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
// No replacement produced: leave Results empty (node not custom lowered).
25405   if (!Res.getNode())
25408   assert((N->getNumValues() <= Res->getNumValues()) &&
25409          "Lowering returned the wrong number of results!");
25411   // Places new result values base on N result number.
25412   // In some cases (LowerSINT_TO_FP for example) Res has more result values
25413   // than original node, chain should be dropped(last value).
25414   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
25415     Results.push_back(Res.getValue(I));
25418 /// Replace a node with an illegal result type with a new node built out of
25420 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
25421 SmallVectorImpl<SDValue>&Results,
25422 SelectionDAG &DAG) const {
25424 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25425 switch (N->getOpcode()) {
25427 llvm_unreachable("Do not know how to custom type legalize this operation!");
25428 case X86ISD::AVG: {
25429 // Legalize types for X86ISD::AVG by expanding vectors.
25430 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25432 auto InVT = N->getValueType(0);
25433 assert(InVT.getSizeInBits() < 128);
25434 assert(128 % InVT.getSizeInBits() == 0);
25435 unsigned NumConcat = 128 / InVT.getSizeInBits();
25437 EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
25438 InVT.getVectorElementType(),
25439 NumConcat * InVT.getVectorNumElements());
25441 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
25442 Ops[0] = N->getOperand(0);
25443 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25444 Ops[0] = N->getOperand(1);
25445 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25447 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
25448 if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
25449 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
25450 DAG.getIntPtrConstant(0, dl));
25451 Results.push_back(Res);
25455 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
25456 // setCC result type is v2i1 because type legalzation will end up with
25457 // a v4i1 setcc plus an extend.
25458 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
25459 if (N->getOperand(0).getValueType() != MVT::v2f32)
25461 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
25462 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25463 N->getOperand(0), UNDEF);
25464 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25465 N->getOperand(1), UNDEF);
25466 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
25468 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25469 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25470 DAG.getIntPtrConstant(0, dl));
25471 Results.push_back(Res);
25474 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
25475 case X86ISD::FMINC:
25477 case X86ISD::FMAXC:
25478 case X86ISD::FMAX: {
25479 EVT VT = N->getValueType(0);
25480 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
25481 SDValue UNDEF = DAG.getUNDEF(VT);
25482 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25483 N->getOperand(0), UNDEF);
25484 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25485 N->getOperand(1), UNDEF);
25486 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
25494 case ISD::UDIVREM: {
25495 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
25496 Results.push_back(V);
25499 case ISD::FP_TO_SINT:
25500 case ISD::FP_TO_UINT: {
25501 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
25502 EVT VT = N->getValueType(0);
25503 SDValue Src = N->getOperand(0);
25504 EVT SrcVT = Src.getValueType();
25506 if (VT == MVT::v2i32) {
25507 assert((IsSigned || Subtarget.hasAVX512()) &&
25508 "Can only handle signed conversion without AVX512");
25509 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25510 if (Src.getValueType() == MVT::v2f64) {
25511 MVT ResVT = MVT::v4i32;
25512 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
25513 if (!IsSigned && !Subtarget.hasVLX()) {
25514 // Widen to 512-bits.
25515 ResVT = MVT::v8i32;
25516 Opc = ISD::FP_TO_UINT;
25517 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
25518 DAG.getUNDEF(MVT::v8f64),
25519 Src, DAG.getIntPtrConstant(0, dl));
25521 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
25522 bool WidenType = getTypeAction(*DAG.getContext(),
25523 MVT::v2i32) == TypeWidenVector;
25524 ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
25525 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
25526 DAG.getIntPtrConstant(0, dl));
25527 Results.push_back(Res);
25530 if (SrcVT == MVT::v2f32) {
25531 SDValue Idx = DAG.getIntPtrConstant(0, dl);
25532 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25533 DAG.getUNDEF(MVT::v2f32));
25534 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
25535 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
25536 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25537 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
25538 Results.push_back(Res);
25542 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
25543 // so early out here.
25547 if (Subtarget.hasDQI() && VT == MVT::i64 &&
25548 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
25549 assert(!Subtarget.is64Bit() && "i64 should be legal");
25550 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
25551 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
25552 // TODO: Use 128-bit vectors for f64 case?
25553 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
25554 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
25555 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
25557 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
25558 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
25559 DAG.getConstantFP(0.0, dl, VecInVT), Src,
25561 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
25562 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
25563 Results.push_back(Res);
25567 std::pair<SDValue,SDValue> Vals =
25568 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
25569 SDValue FIST = Vals.first, StackSlot = Vals.second;
25570 if (FIST.getNode()) {
25571 // Return a load from the stack slot.
25572 if (StackSlot.getNode())
25574 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
25576 Results.push_back(FIST);
25580 case ISD::SINT_TO_FP: {
25581 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
25582 SDValue Src = N->getOperand(0);
25583 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
25585 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
25588 case ISD::UINT_TO_FP: {
25589 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25590 EVT VT = N->getValueType(0);
25591 if (VT != MVT::v2f32)
25593 SDValue Src = N->getOperand(0);
25594 EVT SrcVT = Src.getValueType();
25595 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
25596 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
25599 if (SrcVT != MVT::v2i32)
25601 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
25603 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
25604 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
25605 DAG.getBitcast(MVT::v2i64, VBias));
25606 Or = DAG.getBitcast(MVT::v2f64, Or);
25607 // TODO: Are there any fast-math-flags to propagate here?
25608 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
25609 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
25612 case ISD::FP_ROUND: {
25613 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
25615 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
25616 Results.push_back(V);
25619 case ISD::FP_EXTEND: {
25620 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25621 // No other ValueType for FP_EXTEND should reach this point.
25622 assert(N->getValueType(0) == MVT::v2f32 &&
25623 "Do not know how to legalize this Node");
25626 case ISD::INTRINSIC_W_CHAIN: {
25627 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25629 default : llvm_unreachable("Do not know how to custom type "
25630 "legalize this intrinsic operation!");
25631 case Intrinsic::x86_rdtsc:
25632 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25634 case Intrinsic::x86_rdtscp:
25635 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25637 case Intrinsic::x86_rdpmc:
25638 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25640 case Intrinsic::x86_xgetbv:
25641 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25644 case ISD::INTRINSIC_WO_CHAIN: {
25645 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25646 Results.push_back(V);
25649 case ISD::READCYCLECOUNTER: {
25650 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25653 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25654 EVT T = N->getValueType(0);
25655 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25656 bool Regs64bit = T == MVT::i128;
25657 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
25658 SDValue cpInL, cpInH;
25659 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25660 DAG.getConstant(0, dl, HalfT));
25661 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25662 DAG.getConstant(1, dl, HalfT));
25663 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25664 Regs64bit ? X86::RAX : X86::EAX,
25666 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25667 Regs64bit ? X86::RDX : X86::EDX,
25668 cpInH, cpInL.getValue(1));
25669 SDValue swapInL, swapInH;
25670 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25671 DAG.getConstant(0, dl, HalfT));
25672 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25673 DAG.getConstant(1, dl, HalfT));
25675 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25676 swapInH, cpInH.getValue(1));
25677 // If the current function needs the base pointer, RBX,
25678 // we shouldn't use cmpxchg directly.
25679 // Indeed the lowering of that instruction will clobber
25680 // that register and since RBX will be a reserved register
25681 // the register allocator will not make sure its value will
25682 // be properly saved and restored around this live-range.
25683 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25685 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25686 unsigned BasePtr = TRI->getBaseRegister();
25687 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25688 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25689 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25690 // ISel prefers the LCMPXCHG64 variant.
25691 // If that assert breaks, that means it is not the case anymore,
25692 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25693 // not just EBX. This is a matter of accepting i64 input for that
25694 // pseudo, and restoring into the register of the right wide
25695 // in expand pseudo. Everything else should just work.
25696 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25697 "Saving only half of the RBX");
25698 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25699 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25700 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25701 Regs64bit ? X86::RBX : X86::EBX,
25702 HalfT, swapInH.getValue(1));
25703 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25705 /*Glue*/ RBXSave.getValue(2)};
25706 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25709 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25710 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25711 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25712 swapInH.getValue(1));
25713 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25714 swapInL.getValue(1)};
25715 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25717 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25718 Regs64bit ? X86::RAX : X86::EAX,
25719 HalfT, Result.getValue(1));
25720 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25721 Regs64bit ? X86::RDX : X86::EDX,
25722 HalfT, cpOutL.getValue(2));
25723 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25725 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25726 MVT::i32, cpOutH.getValue(2));
25727 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25728 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25730 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25731 Results.push_back(Success);
25732 Results.push_back(EFLAGS.getValue(1));
25735 case ISD::ATOMIC_SWAP:
25736 case ISD::ATOMIC_LOAD_ADD:
25737 case ISD::ATOMIC_LOAD_SUB:
25738 case ISD::ATOMIC_LOAD_AND:
25739 case ISD::ATOMIC_LOAD_OR:
25740 case ISD::ATOMIC_LOAD_XOR:
25741 case ISD::ATOMIC_LOAD_NAND:
25742 case ISD::ATOMIC_LOAD_MIN:
25743 case ISD::ATOMIC_LOAD_MAX:
25744 case ISD::ATOMIC_LOAD_UMIN:
25745 case ISD::ATOMIC_LOAD_UMAX:
25746 case ISD::ATOMIC_LOAD: {
25747 // Delegate to generic TypeLegalization. Situations we can really handle
25748 // should have already been dealt with by AtomicExpandPass.cpp.
25751 case ISD::BITCAST: {
25752 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25753 EVT DstVT = N->getValueType(0);
25754 EVT SrcVT = N->getOperand(0).getValueType();
25756 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
25757 // we can split using the k-register rather than memory.
25758 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25759 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25761 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25762 Lo = DAG.getBitcast(MVT::i32, Lo);
25763 Hi = DAG.getBitcast(MVT::i32, Hi);
25764 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25765 Results.push_back(Res);
25769 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25770 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
25771 SrcVT.isVector() && isTypeLegal(SrcVT)) {
25773 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25774 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
25775 Lo = DAG.getBitcast(CastVT, Lo);
25776 Hi = DAG.getBitcast(CastVT, Hi);
25777 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
25778 Results.push_back(Res);
25782 if (SrcVT != MVT::f64 ||
25783 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25786 unsigned NumElts = DstVT.getVectorNumElements();
25787 EVT SVT = DstVT.getVectorElementType();
25788 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25789 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25790 MVT::v2f64, N->getOperand(0));
25791 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25793 if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
25794 // If we are legalizing vectors by widening, we already have the desired
25795 // legal vector type, just return it.
25796 Results.push_back(ToVecInt);
25800 SmallVector<SDValue, 8> Elts;
25801 for (unsigned i = 0, e = NumElts; i != e; ++i)
25802 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25803 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25805 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25808 case ISD::MGATHER: {
25809 EVT VT = N->getValueType(0);
25810 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25811 auto *Gather = cast<MaskedGatherSDNode>(N);
25812 SDValue Index = Gather->getIndex();
25813 if (Index.getValueType() != MVT::v2i64)
25815 SDValue Mask = Gather->getMask();
25816 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25817 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25818 Gather->getValue(),
25819 DAG.getUNDEF(MVT::v2f32));
25820 if (!Subtarget.hasVLX()) {
25821 // We need to widen the mask, but the instruction will only use 2
25822 // of its elements. So we can use undef.
25823 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25824 DAG.getUNDEF(MVT::v2i1));
25825 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25827 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25828 Index, Gather->getScale() };
25829 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25830 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25831 Gather->getMemoryVT(), Gather->getMemOperand());
25832 Results.push_back(Res);
25833 Results.push_back(Res.getValue(2));
25836 if (VT == MVT::v2i32) {
25837 auto *Gather = cast<MaskedGatherSDNode>(N);
25838 SDValue Index = Gather->getIndex();
25839 SDValue Mask = Gather->getMask();
25840 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25841 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25842 Gather->getValue(),
25843 DAG.getUNDEF(MVT::v2i32));
25844 // If the index is v2i64 we can use it directly.
25845 if (Index.getValueType() == MVT::v2i64 &&
25846 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25847 if (!Subtarget.hasVLX()) {
25848 // We need to widen the mask, but the instruction will only use 2
25849 // of its elements. So we can use undef.
25850 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25851 DAG.getUNDEF(MVT::v2i1));
25852 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25854 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25855 Index, Gather->getScale() };
25856 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25857 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25858 Gather->getMemoryVT(), Gather->getMemOperand());
25859 SDValue Chain = Res.getValue(2);
25860 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25861 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25862 DAG.getIntPtrConstant(0, dl));
25863 Results.push_back(Res);
25864 Results.push_back(Chain);
25867 EVT IndexVT = Index.getValueType();
25868 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25869 IndexVT.getScalarType(), 4);
25870 // Otherwise we need to custom widen everything to avoid promotion.
25871 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25872 DAG.getUNDEF(IndexVT));
25873 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25874 DAG.getConstant(0, dl, MVT::v2i1));
25875 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25876 Index, Gather->getScale() };
25877 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25878 Gather->getMemoryVT(), dl, Ops,
25879 Gather->getMemOperand());
25880 SDValue Chain = Res.getValue(1);
25881 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25882 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25883 DAG.getIntPtrConstant(0, dl));
25884 Results.push_back(Res);
25885 Results.push_back(Chain);
25893 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25894 switch ((X86ISD::NodeType)Opcode) {
25895 case X86ISD::FIRST_NUMBER: break;
25896 case X86ISD::BSF: return "X86ISD::BSF";
25897 case X86ISD::BSR: return "X86ISD::BSR";
25898 case X86ISD::SHLD: return "X86ISD::SHLD";
25899 case X86ISD::SHRD: return "X86ISD::SHRD";
25900 case X86ISD::FAND: return "X86ISD::FAND";
25901 case X86ISD::FANDN: return "X86ISD::FANDN";
25902 case X86ISD::FOR: return "X86ISD::FOR";
25903 case X86ISD::FXOR: return "X86ISD::FXOR";
25904 case X86ISD::FILD: return "X86ISD::FILD";
25905 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25906 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25907 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25908 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25909 case X86ISD::FLD: return "X86ISD::FLD";
25910 case X86ISD::FST: return "X86ISD::FST";
25911 case X86ISD::CALL: return "X86ISD::CALL";
25912 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25913 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25914 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25915 case X86ISD::BT: return "X86ISD::BT";
25916 case X86ISD::CMP: return "X86ISD::CMP";
25917 case X86ISD::COMI: return "X86ISD::COMI";
25918 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25919 case X86ISD::CMPM: return "X86ISD::CMPM";
25920 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25921 case X86ISD::SETCC: return "X86ISD::SETCC";
25922 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25923 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25924 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25925 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25926 case X86ISD::CMOV: return "X86ISD::CMOV";
25927 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25928 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25929 case X86ISD::IRET: return "X86ISD::IRET";
25930 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25931 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25932 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25933 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25934 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25935 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25936 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25937 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25938 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25939 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25940 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25941 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25942 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25943 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25944 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25945 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25946 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25947 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25948 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25949 case X86ISD::HADD: return "X86ISD::HADD";
25950 case X86ISD::HSUB: return "X86ISD::HSUB";
25951 case X86ISD::FHADD: return "X86ISD::FHADD";
25952 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25953 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25954 case X86ISD::FMAX: return "X86ISD::FMAX";
25955 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25956 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25957 case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
25958 case X86ISD::FMIN: return "X86ISD::FMIN";
25959 case X86ISD::FMINS: return "X86ISD::FMINS";
25960 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25961 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25962 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25963 case X86ISD::FMINC: return "X86ISD::FMINC";
25964 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25965 case X86ISD::FRCP: return "X86ISD::FRCP";
25966 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25967 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25968 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25969 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25970 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25971 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25972 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25973 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25974 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25975 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25976 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25977 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25978 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25979 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25980 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25981 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25982 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25983 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25984 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25985 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25986 case X86ISD::LADD: return "X86ISD::LADD";
25987 case X86ISD::LSUB: return "X86ISD::LSUB";
25988 case X86ISD::LOR: return "X86ISD::LOR";
25989 case X86ISD::LXOR: return "X86ISD::LXOR";
25990 case X86ISD::LAND: return "X86ISD::LAND";
25991 case X86ISD::LINC: return "X86ISD::LINC";
25992 case X86ISD::LDEC: return "X86ISD::LDEC";
25993 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25994 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25995 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25996 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25997 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25998 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25999 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
26000 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
26001 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
26002 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
26003 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
26004 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
26005 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
26006 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
26007 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
26008 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
26009 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
26010 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
26011 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
26012 case X86ISD::VSHL: return "X86ISD::VSHL";
26013 case X86ISD::VSRL: return "X86ISD::VSRL";
26014 case X86ISD::VSRA: return "X86ISD::VSRA";
26015 case X86ISD::VSHLI: return "X86ISD::VSHLI";
26016 case X86ISD::VSRLI: return "X86ISD::VSRLI";
26017 case X86ISD::VSRAI: return "X86ISD::VSRAI";
26018 case X86ISD::VSRAV: return "X86ISD::VSRAV";
26019 case X86ISD::VROTLI: return "X86ISD::VROTLI";
26020 case X86ISD::VROTRI: return "X86ISD::VROTRI";
26021 case X86ISD::VPPERM: return "X86ISD::VPPERM";
26022 case X86ISD::CMPP: return "X86ISD::CMPP";
26023 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
26024 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
26025 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
26026 case X86ISD::ADD: return "X86ISD::ADD";
26027 case X86ISD::SUB: return "X86ISD::SUB";
26028 case X86ISD::ADC: return "X86ISD::ADC";
26029 case X86ISD::SBB: return "X86ISD::SBB";
26030 case X86ISD::SMUL: return "X86ISD::SMUL";
26031 case X86ISD::UMUL: return "X86ISD::UMUL";
26032 case X86ISD::SMUL8: return "X86ISD::SMUL8";
26033 case X86ISD::UMUL8: return "X86ISD::UMUL8";
26034 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
26035 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
26036 case X86ISD::INC: return "X86ISD::INC";
26037 case X86ISD::DEC: return "X86ISD::DEC";
26038 case X86ISD::OR: return "X86ISD::OR";
26039 case X86ISD::XOR: return "X86ISD::XOR";
26040 case X86ISD::AND: return "X86ISD::AND";
26041 case X86ISD::BEXTR: return "X86ISD::BEXTR";
26042 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
26043 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
26044 case X86ISD::PTEST: return "X86ISD::PTEST";
26045 case X86ISD::TESTP: return "X86ISD::TESTP";
26046 case X86ISD::KORTEST: return "X86ISD::KORTEST";
26047 case X86ISD::KTEST: return "X86ISD::KTEST";
26048 case X86ISD::KADD: return "X86ISD::KADD";
26049 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
26050 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
26051 case X86ISD::PACKSS: return "X86ISD::PACKSS";
26052 case X86ISD::PACKUS: return "X86ISD::PACKUS";
26053 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
26054 case X86ISD::VALIGN: return "X86ISD::VALIGN";
26055 case X86ISD::VSHLD: return "X86ISD::VSHLD";
26056 case X86ISD::VSHRD: return "X86ISD::VSHRD";
26057 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
26058 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
26059 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
26060 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
26061 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
26062 case X86ISD::SHUFP: return "X86ISD::SHUFP";
26063 case X86ISD::SHUF128: return "X86ISD::SHUF128";
26064 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
26065 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
26066 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
26067 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
26068 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
26069 case X86ISD::MOVSD: return "X86ISD::MOVSD";
26070 case X86ISD::MOVSS: return "X86ISD::MOVSS";
26071 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
26072 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
26073 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
26074 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
26075 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
26076 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
26077 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
26078 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
26079 case X86ISD::VPERMV: return "X86ISD::VPERMV";
26080 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
26081 case X86ISD::VPERMI: return "X86ISD::VPERMI";
26082 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
26083 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
26084 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
26085 case X86ISD::VRANGE: return "X86ISD::VRANGE";
26086 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
26087 case X86ISD::VRANGES: return "X86ISD::VRANGES";
26088 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
26089 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
26090 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
26091 case X86ISD::PSADBW: return "X86ISD::PSADBW";
26092 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
26093 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
26094 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
26095 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
26096 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
26097 case X86ISD::MFENCE: return "X86ISD::MFENCE";
26098 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
26099 case X86ISD::SAHF: return "X86ISD::SAHF";
26100 case X86ISD::RDRAND: return "X86ISD::RDRAND";
26101 case X86ISD::RDSEED: return "X86ISD::RDSEED";
26102 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
26103 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
26104 case X86ISD::VPSHA: return "X86ISD::VPSHA";
26105 case X86ISD::VPSHL: return "X86ISD::VPSHL";
26106 case X86ISD::VPCOM: return "X86ISD::VPCOM";
26107 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
26108 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
26109 case X86ISD::FMSUB: return "X86ISD::FMSUB";
26110 case X86ISD::FNMADD: return "X86ISD::FNMADD";
26111 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
26112 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
26113 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
26114 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
26115 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
26116 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
26117 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
26118 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
26119 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
26120 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
26121 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
26122 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
26123 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
26124 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
26125 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
26126 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
26127 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
26128 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
26129 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
26130 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
26131 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
26132 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
26133 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
26134 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
26135 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
26136 case X86ISD::XTEST: return "X86ISD::XTEST";
26137 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
26138 case X86ISD::EXPAND: return "X86ISD::EXPAND";
26139 case X86ISD::SELECT: return "X86ISD::SELECT";
26140 case X86ISD::SELECTS: return "X86ISD::SELECTS";
26141 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
26142 case X86ISD::RCP14: return "X86ISD::RCP14";
26143 case X86ISD::RCP14S: return "X86ISD::RCP14S";
26144 case X86ISD::RCP28: return "X86ISD::RCP28";
26145 case X86ISD::RCP28S: return "X86ISD::RCP28S";
26146 case X86ISD::EXP2: return "X86ISD::EXP2";
26147 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
26148 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
26149 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
26150 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
26151 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
26152 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
26153 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
26154 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
26155 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
26156 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
26157 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
26158 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
26159 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
26160 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
26161 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
26162 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
26163 case X86ISD::SCALEF: return "X86ISD::SCALEF";
26164 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
26165 case X86ISD::ADDS: return "X86ISD::ADDS";
26166 case X86ISD::SUBS: return "X86ISD::SUBS";
26167 case X86ISD::AVG: return "X86ISD::AVG";
26168 case X86ISD::MULHRS: return "X86ISD::MULHRS";
26169 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
26170 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
26171 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
26172 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
26173 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
26174 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
26175 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
26176 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
26177 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
26178 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
26179 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
26180 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
26181 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
26182 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
26183 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
26184 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
26185 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
26186 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
26187 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
26188 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
26189 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
26190 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
26191 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
26192 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
26193 case X86ISD::LWPINS: return "X86ISD::LWPINS";
26194 case X86ISD::MGATHER: return "X86ISD::MGATHER";
26195 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
26196 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
26197 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
26198 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
26199 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
26200 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
26201 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
26202 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
26203 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
26204 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
26205 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
26206 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
26207 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
26212 /// Return true if the addressing mode represented by AM is legal for this
26213 /// target, for a load/store of the specified type.
26214 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
26215 const AddrMode &AM, Type *Ty,
26217 Instruction *I) const {
26218 // X86 supports extremely general addressing modes.
26219 CodeModel::Model M = getTargetMachine().getCodeModel();
26221 // X86 allows a sign-extended 32-bit immediate field as a displacement.
26222 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
26226 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
26228 // If a reference to this global requires an extra load, we can't fold it.
26229 if (isGlobalStubReference(GVFlags))
26232 // If BaseGV requires a register for the PIC base, we cannot also have a
26233 // BaseReg specified.
26234 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
26237 // If lower 4G is not available, then we must use rip-relative addressing.
26238 if ((M != CodeModel::Small || isPositionIndependent()) &&
26239 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
26243 switch (AM.Scale) {
26249 // These scales always work.
26254 // These scales are formed with basereg+scalereg. Only accept if there is
26259 default: // Other stuff never works.
26266 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
26267 unsigned Bits = Ty->getScalarSizeInBits();
26269 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
26270 // particularly cheaper than those without.
26274 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
26275 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
26276 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
26279 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
26280 // shifts just as cheap as scalar ones.
26281 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
26284 // AVX512BW has shifts such as vpsllvw.
26285 if (Subtarget.hasBWI() && Bits == 16)
26288 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
26289 // fully general vector.
26293 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
26294 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26296 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
26297 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
26298 return NumBits1 > NumBits2;
26301 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
26302 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26305 if (!isTypeLegal(EVT::getEVT(Ty1)))
26308 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
26310 // Assuming the caller doesn't have a zeroext or signext return parameter,
26311 // truncation all the way down to i1 is valid.
26315 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
26316 return isInt<32>(Imm);
26319 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
26320 // Can also use sub to handle negated immediates.
26321 return isInt<32>(Imm);
26324 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
26325 if (!VT1.isInteger() || !VT2.isInteger())
26327 unsigned NumBits1 = VT1.getSizeInBits();
26328 unsigned NumBits2 = VT2.getSizeInBits();
26329 return NumBits1 > NumBits2;
26332 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
26333 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26334 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
26337 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
26338 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26339 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
26342 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
26343 EVT VT1 = Val.getValueType();
26344 if (isZExtFree(VT1, VT2))
26347 if (Val.getOpcode() != ISD::LOAD)
26350 if (!VT1.isSimple() || !VT1.isInteger() ||
26351 !VT2.isSimple() || !VT2.isInteger())
26354 switch (VT1.getSimpleVT().SimpleTy) {
26359 // X86 has 8, 16, and 32-bit zero-extending loads.
26366 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26367 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26369 // There is no extending load for vXi1.
26370 if (SrcVT.getScalarType() == MVT::i1)
26377 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26378 if (!Subtarget.hasAnyFMA())
26381 VT = VT.getScalarType();
26383 if (!VT.isSimple())
26386 switch (VT.getSimpleVT().SimpleTy) {
26397 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26398 // i16 instructions are longer (0x66 prefix) and potentially slower.
26399 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
26402 /// Targets can use this to indicate that they only support *some*
26403 /// VECTOR_SHUFFLE operations, those with specific masks.
26404 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26405 /// are assumed to be legal.
26406 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26407 if (!VT.isSimple())
26410 // Not for i1 vectors
26411 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26414 // Very little shuffling can be done for 64-bit vectors right now.
26415 if (VT.getSimpleVT().getSizeInBits() == 64)
26418 // We only care that the types being shuffled are legal. The lowering can
26419 // handle any possible shuffle mask that results.
26420 return isTypeLegal(VT.getSimpleVT());
26423 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
26425 // Don't convert an 'and' into a shuffle that we don't directly support.
26426 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
26427 if (!Subtarget.hasAVX2())
26428 if (VT == MVT::v32i8 || VT == MVT::v16i16)
26431 // Just delegate to the generic legality, clear masks aren't special.
26432 return isShuffleMaskLegal(Mask, VT);
26435 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26436 // If the subtarget is using retpolines, we need to not generate jump tables.
26437 if (Subtarget.useRetpoline())
26440 // Otherwise, fallback on the generic logic.
26441 return TargetLowering::areJTsAllowed(Fn);
26444 //===----------------------------------------------------------------------===//
26445 // X86 Scheduler Hooks
26446 //===----------------------------------------------------------------------===//
26448 /// Utility function to emit xbegin specifying the start of an RTM region.
26449 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26450 const TargetInstrInfo *TII) {
26451 DebugLoc DL = MI.getDebugLoc();
26453 const BasicBlock *BB = MBB->getBasicBlock();
26454 MachineFunction::iterator I = ++MBB->getIterator();
26456 // For the v = xbegin(), we generate
26465 // eax = # XABORT_DEF
26469 // v = phi(s0/mainBB, s1/fallBB)
26471 MachineBasicBlock *thisMBB = MBB;
26472 MachineFunction *MF = MBB->getParent();
26473 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26474 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26475 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26476 MF->insert(I, mainMBB);
26477 MF->insert(I, fallMBB);
26478 MF->insert(I, sinkMBB);
26480 // Transfer the remainder of BB and its successor edges to sinkMBB.
26481 sinkMBB->splice(sinkMBB->begin(), MBB,
26482 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26483 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26485 MachineRegisterInfo &MRI = MF->getRegInfo();
26486 unsigned DstReg = MI.getOperand(0).getReg();
26487 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26488 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26489 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26493 // # fallthrough to mainMBB
26494 // # abortion to fallMBB
26495 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26496 thisMBB->addSuccessor(mainMBB);
26497 thisMBB->addSuccessor(fallMBB);
26500 // mainDstReg := -1
26501 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26502 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26503 mainMBB->addSuccessor(sinkMBB);
26506 // ; pseudo instruction to model hardware's definition from XABORT
26507 // EAX := XABORT_DEF
26508 // fallDstReg := EAX
26509 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26510 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26512 fallMBB->addSuccessor(sinkMBB);
26515 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26516 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26517 .addReg(mainDstReg).addMBB(mainMBB)
26518 .addReg(fallDstReg).addMBB(fallMBB);
26520 MI.eraseFromParent();
26524 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26525 const X86Subtarget &Subtarget) {
26526 DebugLoc dl = MI.getDebugLoc();
26527 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26529 // insert input VAL into EAX
26530 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26531 .addReg(MI.getOperand(0).getReg());
26532 // insert zero to ECX
26533 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26535 // insert zero to EDX
26536 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26538 // insert WRPKRU instruction
26539 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26541 MI.eraseFromParent(); // The pseudo is gone now.
26545 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26546 const X86Subtarget &Subtarget) {
26547 DebugLoc dl = MI.getDebugLoc();
26548 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26550 // insert zero to ECX
26551 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26553 // insert RDPKRU instruction
26554 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26555 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26558 MI.eraseFromParent(); // The pseudo is gone now.
26562 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26563 const X86Subtarget &Subtarget,
26565 DebugLoc dl = MI.getDebugLoc();
26566 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26567 // Address into RAX/EAX, other two args into ECX, EDX.
26568 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26569 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26570 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26571 for (int i = 0; i < X86::AddrNumOperands; ++i)
26572 MIB.add(MI.getOperand(i));
26574 unsigned ValOps = X86::AddrNumOperands;
26575 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26576 .addReg(MI.getOperand(ValOps).getReg());
26577 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26578 .addReg(MI.getOperand(ValOps + 1).getReg());
26580 // The instruction doesn't actually take any operands though.
26581 BuildMI(*BB, MI, dl, TII->get(Opc));
26583 MI.eraseFromParent(); // The pseudo is gone now.
26587 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26588 const X86Subtarget &Subtarget) {
26589 DebugLoc dl = MI->getDebugLoc();
26590 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26591 // Address into RAX/EAX
26592 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26593 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26594 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26595 for (int i = 0; i < X86::AddrNumOperands; ++i)
26596 MIB.add(MI->getOperand(i));
26598 // The instruction doesn't actually take any operands though.
26599 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26601 MI->eraseFromParent(); // The pseudo is gone now.
26607 MachineBasicBlock *
26608 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26609 MachineBasicBlock *MBB) const {
26610 // Emit va_arg instruction on X86-64.
26612 // Operands to this pseudo-instruction:
26613 // 0 ) Output : destination address (reg)
26614 // 1-5) Input : va_list address (addr, i64mem)
26615 // 6 ) ArgSize : Size (in bytes) of vararg type
26616 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26617 // 8 ) Align : Alignment of type
26618 // 9 ) EFLAGS (implicit-def)
26620 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26621 static_assert(X86::AddrNumOperands == 5,
26622 "VAARG_64 assumes 5 address operands");
26624 unsigned DestReg = MI.getOperand(0).getReg();
26625 MachineOperand &Base = MI.getOperand(1);
26626 MachineOperand &Scale = MI.getOperand(2);
26627 MachineOperand &Index = MI.getOperand(3);
26628 MachineOperand &Disp = MI.getOperand(4);
26629 MachineOperand &Segment = MI.getOperand(5);
26630 unsigned ArgSize = MI.getOperand(6).getImm();
26631 unsigned ArgMode = MI.getOperand(7).getImm();
26632 unsigned Align = MI.getOperand(8).getImm();
26634 // Memory Reference
26635 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26636 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26637 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26639 // Machine Information
26640 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26641 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26642 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26643 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26644 DebugLoc DL = MI.getDebugLoc();
26646 // struct va_list {
26649 // i64 overflow_area (address)
26650 // i64 reg_save_area (address)
26652 // sizeof(va_list) = 24
26653 // alignment(va_list) = 8
26655 unsigned TotalNumIntRegs = 6;
26656 unsigned TotalNumXMMRegs = 8;
26657 bool UseGPOffset = (ArgMode == 1);
26658 bool UseFPOffset = (ArgMode == 2);
26659 unsigned MaxOffset = TotalNumIntRegs * 8 +
26660 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
26662 /* Align ArgSize to a multiple of 8 */
26663 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26664 bool NeedsAlign = (Align > 8);
26666 MachineBasicBlock *thisMBB = MBB;
26667 MachineBasicBlock *overflowMBB;
26668 MachineBasicBlock *offsetMBB;
26669 MachineBasicBlock *endMBB;
26671 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26672 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26673 unsigned OffsetReg = 0;
26675 if (!UseGPOffset && !UseFPOffset) {
26676 // If we only pull from the overflow region, we don't create a branch.
26677 // We don't need to alter control flow.
26678 OffsetDestReg = 0; // unused
26679 OverflowDestReg = DestReg;
26681 offsetMBB = nullptr;
26682 overflowMBB = thisMBB;
26685 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26686 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26687 // If not, pull from overflow_area. (branch to overflowMBB)
26692 // offsetMBB overflowMBB
26697 // Registers for the PHI in endMBB
26698 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26699 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26701 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26702 MachineFunction *MF = MBB->getParent();
26703 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26704 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26705 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26707 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26709 // Insert the new basic blocks
26710 MF->insert(MBBIter, offsetMBB);
26711 MF->insert(MBBIter, overflowMBB);
26712 MF->insert(MBBIter, endMBB);
26714 // Transfer the remainder of MBB and its successor edges to endMBB.
26715 endMBB->splice(endMBB->begin(), thisMBB,
26716 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26717 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26719 // Make offsetMBB and overflowMBB successors of thisMBB
26720 thisMBB->addSuccessor(offsetMBB);
26721 thisMBB->addSuccessor(overflowMBB);
26723 // endMBB is a successor of both offsetMBB and overflowMBB
26724 offsetMBB->addSuccessor(endMBB);
26725 overflowMBB->addSuccessor(endMBB);
26727 // Load the offset value into a register
26728 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26729 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26733 .addDisp(Disp, UseFPOffset ? 4 : 0)
26735 .setMemRefs(MMOBegin, MMOEnd);
26737 // Check if there is enough room left to pull this argument.
26738 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26740 .addImm(MaxOffset + 8 - ArgSizeA8);
26742 // Branch to "overflowMBB" if offset >= max
26743 // Fall through to "offsetMBB" otherwise
26744 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26745 .addMBB(overflowMBB);
26748 // In offsetMBB, emit code to use the reg_save_area.
26750 assert(OffsetReg != 0);
26752 // Read the reg_save_area address.
26753 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26754 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26760 .setMemRefs(MMOBegin, MMOEnd);
26762 // Zero-extend the offset
26763 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26764 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26767 .addImm(X86::sub_32bit);
26769 // Add the offset to the reg_save_area to get the final address.
26770 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26771 .addReg(OffsetReg64)
26772 .addReg(RegSaveReg);
26774 // Compute the offset for the next argument
26775 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26776 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26778 .addImm(UseFPOffset ? 16 : 8);
26780 // Store it back into the va_list.
26781 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26785 .addDisp(Disp, UseFPOffset ? 4 : 0)
26787 .addReg(NextOffsetReg)
26788 .setMemRefs(MMOBegin, MMOEnd);
26791 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26796 // Emit code to use overflow area
26799 // Load the overflow_area address into a register.
26800 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26801 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26807 .setMemRefs(MMOBegin, MMOEnd);
26809 // If we need to align it, do so. Otherwise, just copy the address
26810 // to OverflowDestReg.
26812 // Align the overflow address
26813 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26814 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26816 // aligned_addr = (addr + (align-1)) & ~(align-1)
26817 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26818 .addReg(OverflowAddrReg)
26821 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26823 .addImm(~(uint64_t)(Align-1));
26825 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26826 .addReg(OverflowAddrReg);
26829 // Compute the next overflow address after this argument.
26830 // (the overflow address should be kept 8-byte aligned)
26831 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26832 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26833 .addReg(OverflowDestReg)
26834 .addImm(ArgSizeA8);
26836 // Store the new overflow address.
26837 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26843 .addReg(NextAddrReg)
26844 .setMemRefs(MMOBegin, MMOEnd);
26846 // If we branched, emit the PHI to the front of endMBB.
26848 BuildMI(*endMBB, endMBB->begin(), DL,
26849 TII->get(X86::PHI), DestReg)
26850 .addReg(OffsetDestReg).addMBB(offsetMBB)
26851 .addReg(OverflowDestReg).addMBB(overflowMBB);
26854 // Erase the pseudo instruction
26855 MI.eraseFromParent();
26860 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26861 MachineInstr &MI, MachineBasicBlock *MBB) const {
26862 // Emit code to save XMM registers to the stack. The ABI says that the
26863 // number of registers to save is given in %al, so it's theoretically
26864 // possible to do an indirect jump trick to avoid saving all of them,
26865 // however this code takes a simpler approach and just executes all
26866 // of the stores if %al is non-zero. It's less code, and it's probably
26867 // easier on the hardware branch predictor, and stores aren't all that
26868 // expensive anyway.
26870 // Create the new basic blocks. One block contains all the XMM stores,
26871 // and one block is the final destination regardless of whether any
26872 // stores were performed.
26873 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26874 MachineFunction *F = MBB->getParent();
26875 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26876 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26877 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26878 F->insert(MBBIter, XMMSaveMBB);
26879 F->insert(MBBIter, EndMBB);
26881 // Transfer the remainder of MBB and its successor edges to EndMBB.
26882 EndMBB->splice(EndMBB->begin(), MBB,
26883 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26884 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26886 // The original block will now fall through to the XMM save block.
26887 MBB->addSuccessor(XMMSaveMBB);
26888 // The XMMSaveMBB will fall through to the end block.
26889 XMMSaveMBB->addSuccessor(EndMBB);
26891 // Now add the instructions.
26892 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26893 DebugLoc DL = MI.getDebugLoc();
26895 unsigned CountReg = MI.getOperand(0).getReg();
26896 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26897 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26899 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26900 // If %al is 0, branch around the XMM save block.
26901 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26902 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26903 MBB->addSuccessor(EndMBB);
26906 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26907 // that was just emitted, but clearly shouldn't be "saved".
26908 assert((MI.getNumOperands() <= 3 ||
26909 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26910 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26911 "Expected last argument to be EFLAGS");
26912 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26913 // In the XMM save block, save all the XMM argument registers.
26914 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26915 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26916 MachineMemOperand *MMO = F->getMachineMemOperand(
26917 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26918 MachineMemOperand::MOStore,
26919 /*Size=*/16, /*Align=*/16);
26920 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26921 .addFrameIndex(RegSaveFrameIndex)
26922 .addImm(/*Scale=*/1)
26923 .addReg(/*IndexReg=*/0)
26924 .addImm(/*Disp=*/Offset)
26925 .addReg(/*Segment=*/0)
26926 .addReg(MI.getOperand(i).getReg())
26927 .addMemOperand(MMO);
26930 MI.eraseFromParent(); // The pseudo instruction is gone now.
26935 // The EFLAGS operand of SelectItr might be missing a kill marker
26936 // because there were multiple uses of EFLAGS, and ISel didn't know
26937 // which to mark. Figure out whether SelectItr should have had a
26938 // kill marker, and set it if it should. Returns the correct kill
26940 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26941 MachineBasicBlock* BB,
26942 const TargetRegisterInfo* TRI) {
26943 // Scan forward through BB for a use/def of EFLAGS.
26944 MachineBasicBlock::iterator miI(std::next(SelectItr));
26945 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26946 const MachineInstr& mi = *miI;
26947 if (mi.readsRegister(X86::EFLAGS))
26949 if (mi.definesRegister(X86::EFLAGS))
26950 break; // Should have kill-flag - update below.
26953 // If we hit the end of the block, check whether EFLAGS is live into a
26955 if (miI == BB->end()) {
26956 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26957 sEnd = BB->succ_end();
26958 sItr != sEnd; ++sItr) {
26959 MachineBasicBlock* succ = *sItr;
26960 if (succ->isLiveIn(X86::EFLAGS))
26965 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26966 // out. SelectMI should have a kill flag on EFLAGS.
26967 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26971 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26972 // together with other CMOV pseudo-opcodes into a single basic-block with
26973 // conditional jump around it.
26974 static bool isCMOVPseudo(MachineInstr &MI) {
26975 switch (MI.getOpcode()) {
26976 case X86::CMOV_FR32:
26977 case X86::CMOV_FR64:
26978 case X86::CMOV_GR8:
26979 case X86::CMOV_GR16:
26980 case X86::CMOV_GR32:
26981 case X86::CMOV_RFP32:
26982 case X86::CMOV_RFP64:
26983 case X86::CMOV_RFP80:
26984 case X86::CMOV_V2F64:
26985 case X86::CMOV_V2I64:
26986 case X86::CMOV_V4F32:
26987 case X86::CMOV_V4F64:
26988 case X86::CMOV_V4I64:
26989 case X86::CMOV_V16F32:
26990 case X86::CMOV_V8F32:
26991 case X86::CMOV_V8F64:
26992 case X86::CMOV_V8I64:
26993 case X86::CMOV_V8I1:
26994 case X86::CMOV_V16I1:
26995 case X86::CMOV_V32I1:
26996 case X86::CMOV_V64I1:
27004 // Helper function, which inserts PHI functions into SinkMBB:
27005 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
27006 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
27007 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
27008 // the last PHI function inserted.
27009 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
27010 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
27011 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
27012 MachineBasicBlock *SinkMBB) {
27013 MachineFunction *MF = TrueMBB->getParent();
27014 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
27015 DebugLoc DL = MIItBegin->getDebugLoc();
27017 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
27018 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27020 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
27022 // As we are creating the PHIs, we have to be careful if there is more than
27023 // one. Later CMOVs may reference the results of earlier CMOVs, but later
27024 // PHIs have to reference the individual true/false inputs from earlier PHIs.
27025 // That also means that PHI construction must work forward from earlier to
27026 // later, and that the code must maintain a mapping from earlier PHI's
27027 // destination registers, and the registers that went into the PHI.
27028 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
27029 MachineInstrBuilder MIB;
27031 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
27032 unsigned DestReg = MIIt->getOperand(0).getReg();
27033 unsigned Op1Reg = MIIt->getOperand(1).getReg();
27034 unsigned Op2Reg = MIIt->getOperand(2).getReg();
27036 // If this CMOV we are generating is the opposite condition from
27037 // the jump we generated, then we have to swap the operands for the
27038 // PHI that is going to be generated.
27039 if (MIIt->getOperand(3).getImm() == OppCC)
27040 std::swap(Op1Reg, Op2Reg);
27042 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
27043 Op1Reg = RegRewriteTable[Op1Reg].first;
27045 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
27046 Op2Reg = RegRewriteTable[Op2Reg].second;
27048 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
27054 // Add this PHI to the rewrite table.
27055 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
27061 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
27062 MachineBasicBlock *
27063 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
27064 MachineInstr &SecondCascadedCMOV,
27065 MachineBasicBlock *ThisMBB) const {
27066 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27067 DebugLoc DL = FirstCMOV.getDebugLoc();
27069 // We lower cascaded CMOVs such as
27071 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
27073 // to two successive branches.
27075 // Without this, we would add a PHI between the two jumps, which ends up
27076 // creating a few copies all around. For instance, for
27078 // (sitofp (zext (fcmp une)))
27080 // we would generate:
27082 // ucomiss %xmm1, %xmm0
27083 // movss <1.0f>, %xmm0
27084 // movaps %xmm0, %xmm1
27086 // xorps %xmm1, %xmm1
27089 // movaps %xmm1, %xmm0
27093 // because this custom-inserter would have generated:
27105 // A: X = ...; Y = ...
27107 // C: Z = PHI [X, A], [Y, B]
27109 // E: PHI [X, C], [Z, D]
27111 // If we lower both CMOVs in a single step, we can instead generate:
27123 // A: X = ...; Y = ...
27125 // E: PHI [X, A], [X, C], [Y, D]
27127 // Which, in our sitofp/fcmp example, gives us something like:
27129 // ucomiss %xmm1, %xmm0
27130 // movss <1.0f>, %xmm0
27133 // xorps %xmm0, %xmm0
27138 // We lower cascaded CMOV into two successive branches to the same block.
27139 // EFLAGS is used by both, so mark it as live in the second.
27140 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27141 MachineFunction *F = ThisMBB->getParent();
27142 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27143 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27144 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27146 MachineFunction::iterator It = ++ThisMBB->getIterator();
27147 F->insert(It, FirstInsertedMBB);
27148 F->insert(It, SecondInsertedMBB);
27149 F->insert(It, SinkMBB);
27151 // For a cascaded CMOV, we lower it to two successive branches to
27152 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
27153 // the FirstInsertedMBB.
27154 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
27156 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27157 // live into the sink and copy blocks.
27158 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27159 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
27160 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
27161 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
27162 SinkMBB->addLiveIn(X86::EFLAGS);
27165 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27166 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27167 std::next(MachineBasicBlock::iterator(FirstCMOV)),
27169 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27171 // Fallthrough block for ThisMBB.
27172 ThisMBB->addSuccessor(FirstInsertedMBB);
27173 // The true block target of the first branch is always SinkMBB.
27174 ThisMBB->addSuccessor(SinkMBB);
27175 // Fallthrough block for FirstInsertedMBB.
27176 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
27177 // The true block for the branch of FirstInsertedMBB.
27178 FirstInsertedMBB->addSuccessor(SinkMBB);
27179 // This is fallthrough.
27180 SecondInsertedMBB->addSuccessor(SinkMBB);
27182 // Create the conditional branch instructions.
27183 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
27184 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
27185 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27187 X86::CondCode SecondCC =
27188 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
27189 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
27190 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
27193 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
27194 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
27195 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
27196 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
27197 MachineInstrBuilder MIB =
27198 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
27200 .addMBB(SecondInsertedMBB)
27204 // The second SecondInsertedMBB provides the same incoming value as the
27205 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
27206 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
27207 // Copy the PHI result to the register defined by the second CMOV.
27208 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
27209 TII->get(TargetOpcode::COPY),
27210 SecondCascadedCMOV.getOperand(0).getReg())
27211 .addReg(FirstCMOV.getOperand(0).getReg());
27213 // Now remove the CMOVs.
27214 FirstCMOV.eraseFromParent();
27215 SecondCascadedCMOV.eraseFromParent();
27220 MachineBasicBlock *
27221 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
27222 MachineBasicBlock *ThisMBB) const {
27223 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27224 DebugLoc DL = MI.getDebugLoc();
27226 // To "insert" a SELECT_CC instruction, we actually have to insert the
27227 // diamond control-flow pattern. The incoming instruction knows the
27228 // destination vreg to set, the condition code register to branch on, the
27229 // true/false values to select between and a branch opcode to use.
27234 // cmpTY ccX, r1, r2
27236 // fallthrough --> FalseMBB
27238 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
27239 // as described above, by inserting a BB, and then making a PHI at the join
27240 // point to select the true and false operands of the CMOV in the PHI.
27242 // The code also handles two different cases of multiple CMOV opcodes
27246 // In this case, there are multiple CMOVs in a row, all which are based on
27247 // the same condition setting (or the exact opposite condition setting).
27248 // In this case we can lower all the CMOVs using a single inserted BB, and
27249 // then make a number of PHIs at the join point to model the CMOVs. The only
27250 // trickiness here, is that in a case like:
27252 // t2 = CMOV cond1 t1, f1
27253 // t3 = CMOV cond1 t2, f2
27255 // when rewriting this into PHIs, we have to perform some renaming on the
27256 // temps since you cannot have a PHI operand refer to a PHI result earlier
27257 // in the same block. The "simple" but wrong lowering would be:
27259 // t2 = PHI t1(BB1), f1(BB2)
27260 // t3 = PHI t2(BB1), f2(BB2)
27262 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
27263 // renaming is to note that on the path through BB1, t2 is really just a
27264 // copy of t1, and do that renaming, properly generating:
27266 // t2 = PHI t1(BB1), f1(BB2)
27267 // t3 = PHI t1(BB1), f2(BB2)
27270 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
27271 // function - EmitLoweredCascadedSelect.
27273 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
27274 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27275 MachineInstr *LastCMOV = &MI;
27276 MachineBasicBlock::iterator NextMIIt =
27277 std::next(MachineBasicBlock::iterator(MI));
27279 // Check for case 1, where there are multiple CMOVs with the same condition
27280 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
27281 // number of jumps the most.
27283 if (isCMOVPseudo(MI)) {
27284 // See if we have a string of CMOVS with the same condition.
27285 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27286 (NextMIIt->getOperand(3).getImm() == CC ||
27287 NextMIIt->getOperand(3).getImm() == OppCC)) {
27288 LastCMOV = &*NextMIIt;
27293 // This checks for case 2, but only do this if we didn't already find
27294 // case 1, as indicated by LastCMOV == MI.
27295 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27296 NextMIIt->getOpcode() == MI.getOpcode() &&
27297 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27298 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27299 NextMIIt->getOperand(1).isKill()) {
27300 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27303 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27304 MachineFunction *F = ThisMBB->getParent();
27305 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27306 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27308 MachineFunction::iterator It = ++ThisMBB->getIterator();
27309 F->insert(It, FalseMBB);
27310 F->insert(It, SinkMBB);
27312 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27313 // live into the sink and copy blocks.
27314 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27315 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27316 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27317 FalseMBB->addLiveIn(X86::EFLAGS);
27318 SinkMBB->addLiveIn(X86::EFLAGS);
27321 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27322 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27323 std::next(MachineBasicBlock::iterator(LastCMOV)),
27325 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27327 // Fallthrough block for ThisMBB.
27328 ThisMBB->addSuccessor(FalseMBB);
27329 // The true block target of the first (or only) branch is always a SinkMBB.
27330 ThisMBB->addSuccessor(SinkMBB);
27331 // Fallthrough block for FalseMBB.
27332 FalseMBB->addSuccessor(SinkMBB);
27334 // Create the conditional branch instruction.
27335 unsigned Opc = X86::GetCondBranchFromCond(CC);
27336 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27339 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27341 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27342 MachineBasicBlock::iterator MIItEnd =
27343 std::next(MachineBasicBlock::iterator(LastCMOV));
27344 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27346 // Now remove the CMOV(s).
27347 ThisMBB->erase(MIItBegin, MIItEnd);
27352 MachineBasicBlock *
27353 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27354 MachineBasicBlock *BB) const {
27355 // Combine the following atomic floating-point modification pattern:
27356 // a.store(reg OP a.load(acquire), release)
27357 // Transform them into:
27358 // OPss (%gpr), %xmm
27359 // movss %xmm, (%gpr)
27360 // Or sd equivalent for 64-bit operations.
27362 switch (MI.getOpcode()) {
27363 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
27364 case X86::RELEASE_FADD32mr:
27365 FOp = X86::ADDSSrm;
27366 MOp = X86::MOVSSmr;
27368 case X86::RELEASE_FADD64mr:
27369 FOp = X86::ADDSDrm;
27370 MOp = X86::MOVSDmr;
27373 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27374 DebugLoc DL = MI.getDebugLoc();
27375 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27376 unsigned ValOpIdx = X86::AddrNumOperands;
27377 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27378 MachineInstrBuilder MIB =
27379 BuildMI(*BB, MI, DL, TII->get(FOp),
27380 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27382 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27383 MachineOperand &Operand = MI.getOperand(i);
27384 // Clear any kill flags on register operands as we'll create a second
27385 // instruction using the same address operands.
27386 if (Operand.isReg())
27387 Operand.setIsKill(false);
27390 MachineInstr *FOpMI = MIB;
27391 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27392 for (int i = 0; i < X86::AddrNumOperands; ++i)
27393 MIB.add(MI.getOperand(i));
27394 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27395 MI.eraseFromParent(); // The pseudo instruction is gone now.
27399 MachineBasicBlock *
27400 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27401 MachineBasicBlock *BB) const {
27402 MachineFunction *MF = BB->getParent();
27403 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27404 DebugLoc DL = MI.getDebugLoc();
27405 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27407 assert(MF->shouldSplitStack());
27409 const bool Is64Bit = Subtarget.is64Bit();
27410 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27412 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27413 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
27416 // ... [Till the alloca]
27417 // If stacklet is not large enough, jump to mallocMBB
27420 // Allocate by subtracting from RSP
27421 // Jump to continueMBB
27424 // Allocate by call to runtime
27428 // [rest of original BB]
27431 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27432 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27433 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27435 MachineRegisterInfo &MRI = MF->getRegInfo();
27436 const TargetRegisterClass *AddrRegClass =
27437 getRegClassFor(getPointerTy(MF->getDataLayout()));
27439 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27440 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27441 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27442 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27443 sizeVReg = MI.getOperand(1).getReg(),
27445 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27447 MachineFunction::iterator MBBIter = ++BB->getIterator();
27449 MF->insert(MBBIter, bumpMBB);
27450 MF->insert(MBBIter, mallocMBB);
27451 MF->insert(MBBIter, continueMBB);
27453 continueMBB->splice(continueMBB->begin(), BB,
27454 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27455 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27457 // Add code to the main basic block to check if the stack limit has been hit,
27458 // and if so, jump to mallocMBB otherwise to bumpMBB.
27459 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27460 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27461 .addReg(tmpSPVReg).addReg(sizeVReg);
27462 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27463 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27464 .addReg(SPLimitVReg);
27465 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27467 // bumpMBB simply decreases the stack pointer, since we know the current
27468 // stacklet has enough space.
27469 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27470 .addReg(SPLimitVReg);
27471 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27472 .addReg(SPLimitVReg);
27473 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27475 // Calls into a routine in libgcc to allocate more space from the heap.
27476 const uint32_t *RegMask =
27477 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
27479 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
27481 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27482 .addExternalSymbol("__morestack_allocate_stack_space")
27483 .addRegMask(RegMask)
27484 .addReg(X86::RDI, RegState::Implicit)
27485 .addReg(X86::RAX, RegState::ImplicitDefine);
27486 } else if (Is64Bit) {
27487 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
27489 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27490 .addExternalSymbol("__morestack_allocate_stack_space")
27491 .addRegMask(RegMask)
27492 .addReg(X86::EDI, RegState::Implicit)
27493 .addReg(X86::EAX, RegState::ImplicitDefine);
27495 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27497 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27498 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27499 .addExternalSymbol("__morestack_allocate_stack_space")
27500 .addRegMask(RegMask)
27501 .addReg(X86::EAX, RegState::ImplicitDefine);
27505 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27508 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27509 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27510 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27512 // Set up the CFG correctly.
27513 BB->addSuccessor(bumpMBB);
27514 BB->addSuccessor(mallocMBB);
27515 mallocMBB->addSuccessor(continueMBB);
27516 bumpMBB->addSuccessor(continueMBB);
27518 // Take care of the PHI nodes.
27519 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27520 MI.getOperand(0).getReg())
27521 .addReg(mallocPtrVReg)
27523 .addReg(bumpSPPtrVReg)
27526 // Delete the original pseudo instruction.
27527 MI.eraseFromParent();
27530 return continueMBB;
27533 MachineBasicBlock *
27534 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27535 MachineBasicBlock *BB) const {
27536 MachineFunction *MF = BB->getParent();
27537 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27538 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27539 DebugLoc DL = MI.getDebugLoc();
27541 assert(!isAsynchronousEHPersonality(
27542 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27543 "SEH does not use catchret!");
27545 // Only 32-bit EH needs to worry about manually restoring stack pointers.
27546 if (!Subtarget.is32Bit())
27549 // C++ EH creates a new target block to hold the restore code, and wires up
27550 // the new block to the return destination with a normal JMP_4.
27551 MachineBasicBlock *RestoreMBB =
27552 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27553 assert(BB->succ_size() == 1);
27554 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27555 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27556 BB->addSuccessor(RestoreMBB);
27557 MI.getOperand(0).setMBB(RestoreMBB);
27559 auto RestoreMBBI = RestoreMBB->begin();
27560 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27561 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27565 MachineBasicBlock *
27566 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27567 MachineBasicBlock *BB) const {
27568 MachineFunction *MF = BB->getParent();
27569 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27570 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27571 // Only 32-bit SEH requires special handling for catchpad.
27572 if (IsSEH && Subtarget.is32Bit()) {
27573 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27574 DebugLoc DL = MI.getDebugLoc();
27575 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27577 MI.eraseFromParent();
27581 MachineBasicBlock *
27582 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27583 MachineBasicBlock *BB) const {
27584 // So, here we replace TLSADDR with the sequence:
27585 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27586 // We need this because TLSADDR is lowered into calls
27587 // inside MC, therefore without the two markers shrink-wrapping
27588 // may push the prologue/epilogue pass them.
27589 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27590 DebugLoc DL = MI.getDebugLoc();
27591 MachineFunction &MF = *BB->getParent();
27593 // Emit CALLSEQ_START right before the instruction.
27594 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27595 MachineInstrBuilder CallseqStart =
27596 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27597 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27599 // Emit CALLSEQ_END right after the instruction.
27600 // We don't call erase from parent because we want to keep the
27601 // original instruction around.
27602 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27603 MachineInstrBuilder CallseqEnd =
27604 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27605 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27610 MachineBasicBlock *
27611 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27612 MachineBasicBlock *BB) const {
27613 // This is pretty easy. We're taking the value that we received from
27614 // our load from the relocation, sticking it in either RDI (x86-64)
27615 // or EAX and doing an indirect call. The return value will then
27616 // be in the normal return register.
27617 MachineFunction *F = BB->getParent();
27618 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27619 DebugLoc DL = MI.getDebugLoc();
27621 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27622 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27624 // Get a register mask for the lowered call.
27625 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27626 // proper register mask.
27627 const uint32_t *RegMask =
27628 Subtarget.is64Bit() ?
27629 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27630 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27631 if (Subtarget.is64Bit()) {
27632 MachineInstrBuilder MIB =
27633 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27637 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27638 MI.getOperand(3).getTargetFlags())
27640 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27641 addDirectMem(MIB, X86::RDI);
27642 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27643 } else if (!isPositionIndependent()) {
27644 MachineInstrBuilder MIB =
27645 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27649 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27650 MI.getOperand(3).getTargetFlags())
27652 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27653 addDirectMem(MIB, X86::EAX);
27654 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27656 MachineInstrBuilder MIB =
27657 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27658 .addReg(TII->getGlobalBaseReg(F))
27661 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27662 MI.getOperand(3).getTargetFlags())
27664 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27665 addDirectMem(MIB, X86::EAX);
27666 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27669 MI.eraseFromParent(); // The pseudo instruction is gone now.
27673 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27675 case X86::RETPOLINE_CALL32:
27676 return X86::CALLpcrel32;
27677 case X86::RETPOLINE_CALL64:
27678 return X86::CALL64pcrel32;
27679 case X86::RETPOLINE_TCRETURN32:
27680 return X86::TCRETURNdi;
27681 case X86::RETPOLINE_TCRETURN64:
27682 return X86::TCRETURNdi64;
27684 llvm_unreachable("not retpoline opcode");
27687 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27689 if (Subtarget.useRetpolineExternalThunk()) {
27690 // When using an external thunk for retpolines, we pick names that match the
27691 // names GCC happens to use as well. This helps simplify the implementation
27692 // of the thunks for kernels where they have no easy ability to create
27693 // aliases and are doing non-trivial configuration of the thunk's body. For
27694 // example, the Linux kernel will do boot-time hot patching of the thunk
27695 // bodies and cannot easily export aliases of these to loaded modules.
27697 // Note that at any point in the future, we may need to change the semantics
27698 // of how we implement retpolines and at that time will likely change the
27699 // name of the called thunk. Essentially, there is no hard guarantee that
27700 // LLVM will generate calls to specific thunks, we merely make a best-effort
27701 // attempt to help out kernels and other systems where duplicating the
27702 // thunks is costly.
27705 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27706 return "__x86_indirect_thunk_eax";
27708 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27709 return "__x86_indirect_thunk_ecx";
27711 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27712 return "__x86_indirect_thunk_edx";
27714 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27715 return "__x86_indirect_thunk_edi";
27717 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27718 return "__x86_indirect_thunk_r11";
27720 llvm_unreachable("unexpected reg for retpoline");
27723 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27726 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27727 return "__llvm_retpoline_eax";
27729 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27730 return "__llvm_retpoline_ecx";
27732 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27733 return "__llvm_retpoline_edx";
27735 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27736 return "__llvm_retpoline_edi";
27738 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27739 return "__llvm_retpoline_r11";
27741 llvm_unreachable("unexpected reg for retpoline");
27744 MachineBasicBlock *
27745 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27746 MachineBasicBlock *BB) const {
27747 // Copy the virtual register into the R11 physical register and
27748 // call the retpoline thunk.
27749 DebugLoc DL = MI.getDebugLoc();
27750 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27751 unsigned CalleeVReg = MI.getOperand(0).getReg();
27752 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27754 // Find an available scratch register to hold the callee. On 64-bit, we can
27755 // just use R11, but we scan for uses anyway to ensure we don't generate
27756 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27757 // already a register use operand to the call to hold the callee. If none
27758 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27759 // register and ESI is the base pointer to realigned stack frames with VLAs.
27760 SmallVector<unsigned, 3> AvailableRegs;
27761 if (Subtarget.is64Bit())
27762 AvailableRegs.push_back(X86::R11);
27764 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27766 // Zero out any registers that are already used.
27767 for (const auto &MO : MI.operands()) {
27768 if (MO.isReg() && MO.isUse())
27769 for (unsigned &Reg : AvailableRegs)
27770 if (Reg == MO.getReg())
27774 // Choose the first remaining non-zero available register.
27775 unsigned AvailableReg = 0;
27776 for (unsigned MaybeReg : AvailableRegs) {
27778 AvailableReg = MaybeReg;
27783 report_fatal_error("calling convention incompatible with retpoline, no "
27784 "available registers");
27786 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27788 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27789 .addReg(CalleeVReg);
27790 MI.getOperand(0).ChangeToES(Symbol);
27791 MI.setDesc(TII->get(Opc));
27792 MachineInstrBuilder(*BB->getParent(), &MI)
27793 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27797 /// SetJmp implies future control flow change upon calling the corresponding
27799 /// Instead of using the 'return' instruction, the long jump fixes the stack and
27800 /// performs an indirect branch. To do so it uses the registers that were stored
27801 /// in the jump buffer (when calling SetJmp).
27802 /// In case the shadow stack is enabled we need to fix it as well, because some
27803 /// return addresses will be skipped.
27804 /// The function will save the SSP for future fixing in the function
27805 /// emitLongJmpShadowStackFix.
27806 /// \sa emitLongJmpShadowStackFix
27807 /// \param [in] MI The temporary Machine Instruction for the builtin.
27808 /// \param [in] MBB The Machine Basic Block that will be modified.
27809 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
27810 MachineBasicBlock *MBB) const {
27811 DebugLoc DL = MI.getDebugLoc();
27812 MachineFunction *MF = MBB->getParent();
27813 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27814 MachineRegisterInfo &MRI = MF->getRegInfo();
27815 MachineInstrBuilder MIB;
27817 // Memory Reference.
27818 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27819 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27821 // Initialize a register with zero.
27822 MVT PVT = getPointerTy(MF->getDataLayout());
27823 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27824 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27825 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27826 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
27828 .addReg(ZReg, RegState::Undef)
27829 .addReg(ZReg, RegState::Undef);
27831 // Read the current SSP Register value to the zeroed register.
27832 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27833 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27834 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27836 // Write the SSP register value to offset 3 in input memory buffer.
27837 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27838 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
27839 const int64_t SSPOffset = 3 * PVT.getStoreSize();
27840 const unsigned MemOpndSlot = 1;
27841 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27842 if (i == X86::AddrDisp)
27843 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
27845 MIB.add(MI.getOperand(MemOpndSlot + i));
27847 MIB.addReg(SSPCopyReg);
27848 MIB.setMemRefs(MMOBegin, MMOEnd);
27851 MachineBasicBlock *
27852 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27853 MachineBasicBlock *MBB) const {
27854 DebugLoc DL = MI.getDebugLoc();
27855 MachineFunction *MF = MBB->getParent();
27856 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27857 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27858 MachineRegisterInfo &MRI = MF->getRegInfo();
27860 const BasicBlock *BB = MBB->getBasicBlock();
27861 MachineFunction::iterator I = ++MBB->getIterator();
27863 // Memory Reference
27864 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27865 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27868 unsigned MemOpndSlot = 0;
27870 unsigned CurOp = 0;
27872 DstReg = MI.getOperand(CurOp++).getReg();
27873 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27874 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27876 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27877 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27879 MemOpndSlot = CurOp;
27881 MVT PVT = getPointerTy(MF->getDataLayout());
27882 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27883 "Invalid Pointer Size!");
27885 // For v = setjmp(buf), we generate
27888 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27889 // SjLjSetup restoreMBB
27895 // v = phi(main, restore)
27898 // if base pointer being used, load it from frame
27901 MachineBasicBlock *thisMBB = MBB;
27902 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27903 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27904 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27905 MF->insert(I, mainMBB);
27906 MF->insert(I, sinkMBB);
27907 MF->push_back(restoreMBB);
27908 restoreMBB->setHasAddressTaken();
27910 MachineInstrBuilder MIB;
27912 // Transfer the remainder of BB and its successor edges to sinkMBB.
27913 sinkMBB->splice(sinkMBB->begin(), MBB,
27914 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27915 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27918 unsigned PtrStoreOpc = 0;
27919 unsigned LabelReg = 0;
27920 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27921 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27922 !isPositionIndependent();
27924 // Prepare IP either in reg or imm.
27925 if (!UseImmLabel) {
27926 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27927 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27928 LabelReg = MRI.createVirtualRegister(PtrRC);
27929 if (Subtarget.is64Bit()) {
27930 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27934 .addMBB(restoreMBB)
27937 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27938 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27939 .addReg(XII->getGlobalBaseReg(MF))
27942 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27946 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27948 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27949 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27950 if (i == X86::AddrDisp)
27951 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27953 MIB.add(MI.getOperand(MemOpndSlot + i));
27956 MIB.addReg(LabelReg);
27958 MIB.addMBB(restoreMBB);
27959 MIB.setMemRefs(MMOBegin, MMOEnd);
27961 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
27962 emitSetJmpShadowStackFix(MI, thisMBB);
27966 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27967 .addMBB(restoreMBB);
27969 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27970 MIB.addRegMask(RegInfo->getNoPreservedMask());
27971 thisMBB->addSuccessor(mainMBB);
27972 thisMBB->addSuccessor(restoreMBB);
27976 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27977 mainMBB->addSuccessor(sinkMBB);
27980 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27981 TII->get(X86::PHI), DstReg)
27982 .addReg(mainDstReg).addMBB(mainMBB)
27983 .addReg(restoreDstReg).addMBB(restoreMBB);
27986 if (RegInfo->hasBasePointer(*MF)) {
27987 const bool Uses64BitFramePtr =
27988 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27989 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27990 X86FI->setRestoreBasePointer(MF);
27991 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27992 unsigned BasePtr = RegInfo->getBaseRegister();
27993 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27994 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27995 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27996 .setMIFlag(MachineInstr::FrameSetup);
27998 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27999 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
28000 restoreMBB->addSuccessor(sinkMBB);
28002 MI.eraseFromParent();
28006 /// Fix the shadow stack using the previously saved SSP pointer.
28007 /// \sa emitSetJmpShadowStackFix
28008 /// \param [in] MI The temporary Machine Instruction for the builtin.
28009 /// \param [in] MBB The Machine Basic Block that will be modified.
28010 /// \return The sink MBB that will perform the future indirect branch.
28011 MachineBasicBlock *
28012 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
28013 MachineBasicBlock *MBB) const {
28014 DebugLoc DL = MI.getDebugLoc();
28015 MachineFunction *MF = MBB->getParent();
28016 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28017 MachineRegisterInfo &MRI = MF->getRegInfo();
28019 // Memory Reference
28020 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28021 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28023 MVT PVT = getPointerTy(MF->getDataLayout());
28024 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
28027 // xor vreg1, vreg1
28029 // test vreg1, vreg1
28030 // je sinkMBB # Jump if Shadow Stack is not supported
28032 // mov buf+24/12(%rip), vreg2
28033 // sub vreg1, vreg2
28034 // jbe sinkMBB # No need to fix the Shadow Stack
28037 // incssp vreg2 # fix the SSP according to the lower 8 bits
28040 // fixShadowLoopPrepareMBB:
28043 // fixShadowLoopMBB:
28046 // jne fixShadowLoopMBB # Iterate until you finish fixing
28047 // # the Shadow Stack
28050 MachineFunction::iterator I = ++MBB->getIterator();
28051 const BasicBlock *BB = MBB->getBasicBlock();
28053 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
28054 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
28055 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
28056 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
28057 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
28058 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
28059 MF->insert(I, checkSspMBB);
28060 MF->insert(I, fallMBB);
28061 MF->insert(I, fixShadowMBB);
28062 MF->insert(I, fixShadowLoopPrepareMBB);
28063 MF->insert(I, fixShadowLoopMBB);
28064 MF->insert(I, sinkMBB);
28066 // Transfer the remainder of BB and its successor edges to sinkMBB.
28067 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
28069 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
28071 MBB->addSuccessor(checkSspMBB);
28073 // Initialize a register with zero.
28074 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
28075 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
28076 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
28078 .addReg(ZReg, RegState::Undef)
28079 .addReg(ZReg, RegState::Undef);
28081 // Read the current SSP Register value to the zeroed register.
28082 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
28083 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
28084 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
28086 // Check whether the result of the SSP register is zero and jump directly
28088 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
28089 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
28090 .addReg(SSPCopyReg)
28091 .addReg(SSPCopyReg);
28092 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28093 checkSspMBB->addSuccessor(sinkMBB);
28094 checkSspMBB->addSuccessor(fallMBB);
28096 // Reload the previously saved SSP register value.
28097 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
28098 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28099 const int64_t SPPOffset = 3 * PVT.getStoreSize();
28100 MachineInstrBuilder MIB =
28101 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
28102 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28103 if (i == X86::AddrDisp)
28104 MIB.addDisp(MI.getOperand(i), SPPOffset);
28106 MIB.add(MI.getOperand(i));
28108 MIB.setMemRefs(MMOBegin, MMOEnd);
28110 // Subtract the current SSP from the previous SSP.
28111 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
28112 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
28113 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
28114 .addReg(PrevSSPReg)
28115 .addReg(SSPCopyReg);
28117 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
28118 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
28119 fallMBB->addSuccessor(sinkMBB);
28120 fallMBB->addSuccessor(fixShadowMBB);
28122 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
28123 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
28124 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
28125 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
28126 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
28130 // Increase SSP when looking only on the lower 8 bits of the delta.
28131 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
28132 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
28134 // Reset the lower 8 bits.
28135 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
28136 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
28137 .addReg(SspFirstShrReg)
28140 // Jump if the result of the shift is zero.
28141 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28142 fixShadowMBB->addSuccessor(sinkMBB);
28143 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
28145 // Do a single shift left.
28146 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
28147 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
28148 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
28149 .addReg(SspSecondShrReg);
28151 // Save the value 128 to a register (will be used next with incssp).
28152 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
28153 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
28154 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
28156 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
28158 // Since incssp only looks at the lower 8 bits, we might need to do several
28159 // iterations of incssp until we finish fixing the shadow stack.
28160 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
28161 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
28162 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
28163 .addReg(SspAfterShlReg)
28164 .addMBB(fixShadowLoopPrepareMBB)
28166 .addMBB(fixShadowLoopMBB);
28168 // Every iteration we increase the SSP by 128.
28169 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
28171 // Every iteration we decrement the counter by 1.
28172 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
28173 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
28175 // Jump if the counter is not zero yet.
28176 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
28177 fixShadowLoopMBB->addSuccessor(sinkMBB);
28178 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
// Lower the EH_SjLj_LongJmp pseudo: reload the frame pointer, the resume
// label, and the stack pointer from the setjmp buffer addressed by MI's
// memory operands, then perform an indirect jump to the reloaded label.
// Buffer layout (in pointer-sized slots): [0] = FP, [1] = label, [2] = SP.
// NOTE(review): this dump elides some interior lines of the original body.
28183 MachineBasicBlock *
28184 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
28185 MachineBasicBlock *MBB) const {
28186 DebugLoc DL = MI.getDebugLoc();
28187 MachineFunction *MF = MBB->getParent();
28188 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28189 MachineRegisterInfo &MRI = MF->getRegInfo();
// Carry the pseudo's memory operands onto each load we emit below.
28191 // Memory Reference
28192 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28193 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28195 MVT PVT = getPointerTy(MF->getDataLayout());
28196 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
28197 "Invalid Pointer Size!");
28199 const TargetRegisterClass *RC =
28200 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
// Tmp receives the resume-label address and is the indirect-jump target.
28201 unsigned Tmp = MRI.createVirtualRegister(RC);
28202 // Since FP is only updated here but NOT referenced, it's treated as GPR.
28203 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28204 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
28205 unsigned SP = RegInfo->getStackRegister();
28207 MachineInstrBuilder MIB;
// Slot offsets into the setjmp buffer, scaled by the pointer store size.
28209 const int64_t LabelOffset = 1 * PVT.getStoreSize();
28210 const int64_t SPOffset = 2 * PVT.getStoreSize();
28212 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28213 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
28215 MachineBasicBlock *thisMBB = MBB;
28217 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
28218 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
28219 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
// Reload FP from slot 0 of the buffer (same address operands as MI).
28223 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
28224 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
28225 MIB.add(MI.getOperand(i));
28226 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload the resume label from slot 1 (displacement bumped by LabelOffset).
28229 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
28230 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28231 if (i == X86::AddrDisp)
28232 MIB.addDisp(MI.getOperand(i), LabelOffset);
28234 MIB.add(MI.getOperand(i));
28236 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP from slot 2 (displacement bumped by SPOffset).
28239 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
28240 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28241 if (i == X86::AddrDisp)
28242 MIB.addDisp(MI.getOperand(i), SPOffset);
28244 MIB.add(MI.getOperand(i));
28246 MIB.setMemRefs(MMOBegin, MMOEnd);
// Transfer control to the reloaded label and delete the pseudo.
28249 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
28251 MI.eraseFromParent();
// Store the address of the SjLj dispatch block (DispatchBB) into the
// function-context frame object FI, so the runtime can resume there.
// Small non-PIC code models store the label address as an immediate;
// otherwise it is materialized into a virtual register with LEA first.
// NOTE(review): this dump elides some interior lines of the original body.
28255 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
28256 MachineBasicBlock *MBB,
28257 MachineBasicBlock *DispatchBB,
28259 DebugLoc DL = MI.getDebugLoc();
28260 MachineFunction *MF = MBB->getParent();
28261 MachineRegisterInfo *MRI = &MF->getRegInfo();
28262 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28264 MVT PVT = getPointerTy(MF->getDataLayout());
28265 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// Immediate-label form is only valid for small, non-PIC code.
28270 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
28271 !isPositionIndependent();
// Immediate store of the block address.
28274 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Register form: LEA the block address into VR, then store VR.
28276 const TargetRegisterClass *TRC =
28277 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28278 VR = MRI->createVirtualRegister(TRC);
28279 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
28281 if (Subtarget.is64Bit())
28282 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
28286 .addMBB(DispatchBB)
28289 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
28290 .addReg(0) /* TII->getGlobalBaseReg(MF) */
28293 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
// Store into the function context at a fixed offset (56 on 64-bit,
// 36 on 32-bit) within frame index FI.
28297 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
28298 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
28300 MIB.addMBB(DispatchBB);
// Build the SjLj exception dispatch machinery for this function:
//  1. Map call-site numbers to their landing pads and build an ordered
//     landing-pad list for a jump table.
//  2. Create DispatchBB (the single EH pad), a TrapBB for out-of-range
//     call-site indices, and DispContBB which performs the indexed jump.
//  3. Rewrite every invoke block's EH successor to DispatchBB and mark
//     callee-saved registers implicit-def on the calls so they get spilled.
// NOTE(review): this dump elides some interior lines of the original body.
28305 MachineBasicBlock *
28306 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
28307 MachineBasicBlock *BB) const {
28308 DebugLoc DL = MI.getDebugLoc();
28309 MachineFunction *MF = BB->getParent();
28310 MachineFrameInfo &MFI = MF->getFrameInfo();
28311 MachineRegisterInfo *MRI = &MF->getRegInfo();
28312 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28313 int FI = MFI.getFunctionContextIndex();
28315 // Get a mapping of the call site numbers to all of the landing pads they're
28316 // associated with.
28317 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
28318 unsigned MaxCSNum = 0;
28319 for (auto &MBB : *MF) {
28320 if (!MBB.isEHPad())
// Find the EH_LABEL that identifies this landing pad.
28323 MCSymbol *Sym = nullptr;
28324 for (const auto &MI : MBB) {
28325 if (MI.isDebugInstr())
28328 assert(MI.isEHLabel() && "expected EH_LABEL");
28329 Sym = MI.getOperand(0).getMCSymbol();
28333 if (!MF->hasCallSiteLandingPad(Sym))
28336 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
28337 CallSiteNumToLPad[CSI].push_back(&MBB);
28338 MaxCSNum = std::max(MaxCSNum, CSI);
28342 // Get an ordered list of the machine basic blocks for the jump table.
28343 std::vector<MachineBasicBlock *> LPadList;
28344 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
28345 LPadList.reserve(CallSiteNumToLPad.size());
// Call-site numbers start at 1; collect pads in call-site order and
// remember every predecessor (the invoke blocks) for rewriting below.
28347 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
28348 for (auto &LP : CallSiteNumToLPad[CSI]) {
28349 LPadList.push_back(LP);
28350 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
28354 assert(!LPadList.empty() &&
28355 "No landing pad destinations for the dispatch jump table!");
28357 // Create the MBBs for the dispatch code.
28359 // Shove the dispatch's address into the return slot in the function context.
28360 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
28361 DispatchBB->setIsEHPad(true);
// TrapBB is reached when the call-site index is out of table range.
28363 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
28364 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
28365 DispatchBB->addSuccessor(TrapBB);
28367 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
28368 DispatchBB->addSuccessor(DispContBB);
28371 MF->push_back(DispatchBB);
28372 MF->push_back(DispContBB);
28373 MF->push_back(TrapBB);
28375 // Insert code into the entry block that creates and registers the function
28377 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
28379 // Create the jump table and associated information
28380 unsigned JTE = getJumpTableEncoding();
28381 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
28382 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
28384 const X86RegisterInfo &RI = TII->getRegisterInfo();
28385 // Add a register mask with no preserved registers. This results in all
28386 // registers being marked as clobbered.
28387 if (RI.hasBasePointer(*MF)) {
// With a base pointer, reload it from its spill slot in DispatchBB
// before anything else uses it.
28388 const bool FPIs64Bit =
28389 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
28390 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
28391 MFI->setRestoreBasePointer(MF);
28393 unsigned FP = RI.getFrameRegister(*MF);
28394 unsigned BP = RI.getBaseRegister();
28395 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
28396 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
28397 MFI->getRestoreBasePointerOffset())
28398 .addRegMask(RI.getNoPreservedMask());
// No base pointer: NOOP exists only to carry the clobber-all regmask.
28400 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
28401 .addRegMask(RI.getNoPreservedMask());
28404 // IReg is used as an index in a memory operand and therefore can't be SP
28405 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
// Load the call-site index from the function context, bounds-check it
// against the table size, and trap on out-of-range.
28406 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
28407 Subtarget.is64Bit() ? 8 : 4);
28408 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
28410 .addImm(LPadList.size());
28411 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
28413 if (Subtarget.is64Bit()) {
28414 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28415 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
28417 // leaq .LJTI0_0(%rip), BReg
28418 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
28422 .addJumpTableIndex(MJTI)
28424 // movzx IReg64, IReg
28425 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
28428 .addImm(X86::sub_32bit);
// Emit the indexed jump per jump-table encoding.
28431 case MachineJumpTableInfo::EK_BlockAddress:
28432 // jmpq *(BReg,IReg64,8)
28433 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
28440 case MachineJumpTableInfo::EK_LabelDifference32: {
// Table holds 32-bit label differences: load, sign-extend, add the
// table base back in, then jump through the computed address.
28441 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
28442 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
28443 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28445 // movl (BReg,IReg64,4), OReg
28446 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
28452 // movsx OReg64, OReg
28453 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
28454 // addq BReg, OReg64, TReg
28455 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
28459 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
28463 llvm_unreachable("Unexpected jump table encoding");
// 32-bit path: single memory-indirect jump through the table.
28466 // jmpl *.LJTI0_0(,IReg,4)
28467 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
28471 .addJumpTableIndex(MJTI)
28475 // Add the jump table entries as successors to the MBB.
28476 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
28477 for (auto &LP : LPadList)
28478 if (SeenMBBs.insert(LP).second)
28479 DispContBB->addSuccessor(LP);
28481 // N.B. the order the invoke BBs are processed in doesn't matter here.
28482 SmallVector<MachineBasicBlock *, 64> MBBLPads;
28483 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
28484 for (MachineBasicBlock *MBB : InvokeBBs) {
28485 // Remove the landing pad successor from the invoke block and replace it
28486 // with the new dispatch block.
28487 // Keep a copy of Successors since it's modified inside the loop.
28488 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
28490 // FIXME: Avoid quadratic complexity.
28491 for (auto MBBS : Successors) {
28492 if (MBBS->isEHPad()) {
28493 MBB->removeSuccessor(MBBS);
28494 MBBLPads.push_back(MBBS);
28498 MBB->addSuccessor(DispatchBB);
28500 // Find the invoke call and mark all of the callee-saved registers as
28501 // 'implicit defined' so that they're spilled. This prevents code from
28502 // moving instructions to before the EH block, where they will never be
28504 for (auto &II : reverse(*MBB)) {
// Collect registers the call already defines to avoid duplicates.
28508 DenseMap<unsigned, bool> DefRegs;
28509 for (auto &MOp : II.operands())
28511 DefRegs[MOp.getReg()] = true;
28513 MachineInstrBuilder MIB(*MF, &II);
28514 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
28515 unsigned Reg = SavedRegs[RI];
28517 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
28524 // Mark all former landing pads as non-landing pads. The dispatch is the only
28525 // landing pad now.
28526 for (auto &LP : MBBLPads)
28527 LP->setIsEHPad(false);
28529 // The instruction is gone now.
28530 MI.eraseFromParent();
// Central dispatcher for pseudo-instructions that require custom MachineIR
// expansion after instruction selection. Each case either delegates to a
// dedicated Emit*/emit* helper or expands the pseudo inline, erasing MI
// when it is consumed. Returns the block where insertion should continue.
// NOTE(review): this dump elides some interior lines of the original body.
28534 MachineBasicBlock *
28535 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28536 MachineBasicBlock *BB) const {
28537 MachineFunction *MF = BB->getParent();
28538 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28539 DebugLoc DL = MI.getDebugLoc();
28541 switch (MI.getOpcode()) {
28542 default: llvm_unreachable("Unexpected instr type to insert");
28543 case X86::TLS_addr32:
28544 case X86::TLS_addr64:
28545 case X86::TLS_base_addr32:
28546 case X86::TLS_base_addr64:
28547 return EmitLoweredTLSAddr(MI, BB);
28548 case X86::RETPOLINE_CALL32:
28549 case X86::RETPOLINE_CALL64:
28550 case X86::RETPOLINE_TCRETURN32:
28551 case X86::RETPOLINE_TCRETURN64:
28552 return EmitLoweredRetpoline(MI, BB);
28553 case X86::CATCHRET:
28554 return EmitLoweredCatchRet(MI, BB);
28555 case X86::CATCHPAD:
28556 return EmitLoweredCatchPad(MI, BB);
28557 case X86::SEG_ALLOCA_32:
28558 case X86::SEG_ALLOCA_64:
28559 return EmitLoweredSegAlloca(MI, BB);
28560 case X86::TLSCall_32:
28561 case X86::TLSCall_64:
28562 return EmitLoweredTLSCall(MI, BB);
// All CMOV pseudos (scalar, FP-stack, vector, and mask flavors) share the
// same diamond-control-flow expansion in EmitLoweredSelect.
28563 case X86::CMOV_FR32:
28564 case X86::CMOV_FR64:
28565 case X86::CMOV_F128:
28566 case X86::CMOV_GR8:
28567 case X86::CMOV_GR16:
28568 case X86::CMOV_GR32:
28569 case X86::CMOV_RFP32:
28570 case X86::CMOV_RFP64:
28571 case X86::CMOV_RFP80:
28572 case X86::CMOV_V2F64:
28573 case X86::CMOV_V2I64:
28574 case X86::CMOV_V4F32:
28575 case X86::CMOV_V4F64:
28576 case X86::CMOV_V4I64:
28577 case X86::CMOV_V16F32:
28578 case X86::CMOV_V8F32:
28579 case X86::CMOV_V8F64:
28580 case X86::CMOV_V8I64:
28581 case X86::CMOV_V8I1:
28582 case X86::CMOV_V16I1:
28583 case X86::CMOV_V32I1:
28584 case X86::CMOV_V64I1:
28585 return EmitLoweredSelect(MI, BB);
// Read EFLAGS: PUSHF then POP into the destination register.
28587 case X86::RDFLAGS32:
28588 case X86::RDFLAGS64: {
28590 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28591 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28592 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28593 // Permit reads of the EFLAGS and DF registers without them being defined.
28594 // This intrinsic exists to read external processor state in flags, such as
28595 // the trap flag, interrupt flag, and direction flag, none of which are
28596 // modeled by the backend.
28597 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
28598 "Unexpected register in operand!");
28599 Push->getOperand(2).setIsUndef();
28600 assert(Push->getOperand(3).getReg() == X86::DF &&
28601 "Unexpected register in operand!");
28602 Push->getOperand(3).setIsUndef();
28603 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28605 MI.eraseFromParent(); // The pseudo is gone now.
// Write EFLAGS: PUSH the source register then POPF.
28609 case X86::WRFLAGS32:
28610 case X86::WRFLAGS64: {
28612 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28614 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28615 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28616 BuildMI(*BB, MI, DL, TII->get(PopF));
28618 MI.eraseFromParent(); // The pseudo is gone now.
28622 case X86::RELEASE_FADD32mr:
28623 case X86::RELEASE_FADD64mr:
28624 return EmitLoweredAtomicFP(MI, BB);
28626 case X86::FP32_TO_INT16_IN_MEM:
28627 case X86::FP32_TO_INT32_IN_MEM:
28628 case X86::FP32_TO_INT64_IN_MEM:
28629 case X86::FP64_TO_INT16_IN_MEM:
28630 case X86::FP64_TO_INT32_IN_MEM:
28631 case X86::FP64_TO_INT64_IN_MEM:
28632 case X86::FP80_TO_INT16_IN_MEM:
28633 case X86::FP80_TO_INT32_IN_MEM:
28634 case X86::FP80_TO_INT64_IN_MEM: {
28635 // Change the floating point control register to use "round towards zero"
28636 // mode when truncating to an integer value.
// Save the current x87 control word to a 2-byte stack slot.
28637 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28638 addFrameReference(BuildMI(*BB, MI, DL,
28639 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28641 // Load the old value of the high byte of the control word...
28643 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28644 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28647 // Set the high part to be round to zero...
28648 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28651 // Reload the modified control word now...
28652 addFrameReference(BuildMI(*BB, MI, DL,
28653 TII->get(X86::FLDCW16m)), CWFrameIdx);
28655 // Restore the memory image of control word to original value
28656 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28659 // Get the X86 opcode to use.
28661 switch (MI.getOpcode()) {
28662 default: llvm_unreachable("illegal opcode!");
28663 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28664 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28665 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28666 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28667 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28668 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28669 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28670 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28671 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
// Perform the store-with-truncation using MI's address operands.
28674 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28675 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28676 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28678 // Reload the original control word now.
28679 addFrameReference(BuildMI(*BB, MI, DL,
28680 TII->get(X86::FLDCW16m)), CWFrameIdx);
28682 MI.eraseFromParent(); // The pseudo instruction is gone now.
28685 // Thread synchronization.
28687 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28688 case X86::MONITORX:
28689 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28693 return emitClzero(&MI, BB, Subtarget);
28697 return emitWRPKRU(MI, BB, Subtarget);
28699 return emitRDPKRU(MI, BB, Subtarget);
28702 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28704 case X86::VASTART_SAVE_XMM_REGS:
28705 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28707 case X86::VAARG_64:
28708 return EmitVAARG64WithCustomInserter(MI, BB);
28710 case X86::EH_SjLj_SetJmp32:
28711 case X86::EH_SjLj_SetJmp64:
28712 return emitEHSjLjSetJmp(MI, BB);
28714 case X86::EH_SjLj_LongJmp32:
28715 case X86::EH_SjLj_LongJmp64:
28716 return emitEHSjLjLongJmp(MI, BB);
28718 case X86::Int_eh_sjlj_setup_dispatch:
28719 return EmitSjLjDispatchBlock(MI, BB);
28721 case TargetOpcode::STATEPOINT:
28722 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28723 // this point in the process. We diverge later.
28724 return emitPatchPoint(MI, BB);
28726 case TargetOpcode::STACKMAP:
28727 case TargetOpcode::PATCHPOINT:
28728 return emitPatchPoint(MI, BB);
28730 case TargetOpcode::PATCHABLE_EVENT_CALL:
28731 return emitXRayCustomEvent(MI, BB);
28733 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
28734 return emitXRayTypedEvent(MI, BB);
28736 case X86::LCMPXCHG8B: {
28737 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28738 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
28739 // requires a memory operand. If it happens that current architecture is
28740 // i686 and for current function we need a base pointer
28741 // - which is ESI for i686 - register allocator would not be able to
28742 // allocate registers for an address in form of X(%reg, %reg, Y)
28743 // - there never would be enough unreserved registers during regalloc
28744 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
28745 // We are giving a hand to register allocator by precomputing the address in
28746 // a new vreg using LEA.
28748 // If it is not i686 or there is no base pointer - nothing to do here.
28749 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28752 // Even though this code does not necessarily needs the base pointer to
28753 // be ESI, we check for that. The reason: if this assert fails, there are
28754 // some changes happened in the compiler base pointer handling, which most
28755 // probably have to be addressed somehow here.
28756 assert(TRI->getBaseRegister() == X86::ESI &&
28757 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28758 "base pointer in mind");
28760 MachineRegisterInfo &MRI = MF->getRegInfo();
28761 MVT SPTy = getPointerTy(MF->getDataLayout());
28762 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28763 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28765 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28766 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28767 // does not use index register.
28768 if (AM.IndexReg == X86::NoRegister)
28771 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28772 // four operand definitions that are E[ABCD] registers. We skip them and
28773 // then insert the LEA.
28774 MachineBasicBlock::iterator MBBI(MI);
28775 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28776 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28779 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
// Rewrite CMPXCHG8B's memory operand to the precomputed address vreg.
28781 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28785 case X86::LCMPXCHG16B:
28787 case X86::LCMPXCHG8B_SAVE_EBX:
28788 case X86::LCMPXCHG16B_SAVE_RBX: {
28790 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
// The base-pointer register must be live into this block for the
// SAVE_EBX/SAVE_RBX expansion.
28791 if (!BB->isLiveIn(BasePtr))
28792 BB->addLiveIn(BasePtr);
28798 //===----------------------------------------------------------------------===//
28799 // X86 Optimization Hooks
28800 //===----------------------------------------------------------------------===//
// Target hook: instead of minimally shrinking an AND mask, try to widen it
// to a low-bits zero-extend mask (e.g. 0xFF/0xFFFF/0xFFFFFFFF) so the AND
// can be matched as a movzx. Returns via TLO.CombineTo on success.
// NOTE(review): this dump elides some interior lines of the original body.
28803 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28804 const APInt &Demanded,
28805 TargetLoweringOpt &TLO) const {
28806 // Only optimize Ands to prevent shrinking a constant that could be
28807 // matched by movzx.
28808 if (Op.getOpcode() != ISD::AND)
28811 EVT VT = Op.getValueType();
28817 unsigned Size = VT.getSizeInBits();
28819 // Make sure the RHS really is a constant.
28820 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28824 const APInt &Mask = C->getAPIntValue();
28826 // Clear all non-demanded bits initially.
28827 APInt ShrunkMask = Mask & Demanded;
28829 // Find the width of the shrunk mask.
28830 unsigned Width = ShrunkMask.getActiveBits();
28832 // If the mask is all 0s there's nothing to do here.
28836 // Find the next power of 2 width, rounding up to a byte.
28837 Width = PowerOf2Ceil(std::max(Width, 8U));
28838 // Truncate the width to size to handle illegal types.
28839 Width = std::min(Width, Size);
28841 // Calculate a possible zero extend mask for this constant.
28842 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28844 // If we aren't changing the mask, just return true to keep it and prevent
28845 // the caller from optimizing.
28846 if (ZeroExtendMask == Mask)
28849 // Make sure the new mask can be represented by a combination of mask bits
28850 // and non-demanded bits.
28851 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28854 // Replace the constant with the zero extend mask.
28856 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28857 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28858 return TLO.CombineTo(Op, NewOp);
// Compute known-zero/known-one bits for X86-specific SDNodes so generic
// DAG combines can reason about them. Handles SETCC/MOVMSK bit-width
// facts, element extracts, immediate vector shifts, PACKUS, VZEXT, CMOV,
// zero-extended div/rem results, and (generically) target shuffles.
// NOTE(review): this dump elides some interior lines of the original body.
28861 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28863 const APInt &DemandedElts,
28864 const SelectionDAG &DAG,
28865 unsigned Depth) const {
28866 unsigned BitWidth = Known.getBitWidth();
28867 unsigned Opc = Op.getOpcode();
28868 EVT VT = Op.getValueType();
28869 assert((Opc >= ISD::BUILTIN_OP_END ||
28870 Opc == ISD::INTRINSIC_WO_CHAIN ||
28871 Opc == ISD::INTRINSIC_W_CHAIN ||
28872 Opc == ISD::INTRINSIC_VOID) &&
28873 "Should use MaskedValueIsZero if you don't know whether Op"
28874 " is a target node!");
// SETCC produces 0/1: everything above bit 0 is known zero.
28879 case X86ISD::SETCC:
28880 Known.Zero.setBitsFrom(1);
// MOVMSK sets one low bit per source vector element; the rest are zero.
28882 case X86ISD::MOVMSK: {
28883 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28884 Known.Zero.setBitsFrom(NumLoBits);
28887 case X86ISD::PEXTRB:
28888 case X86ISD::PEXTRW: {
// Extracted element: ask about just that lane, then widen; bits above
// the source element size are zero.
28889 SDValue Src = Op.getOperand(0);
28890 EVT SrcVT = Src.getValueType();
28891 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28892 Op.getConstantOperandVal(1));
28893 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28894 Known = Known.zextOrTrunc(BitWidth);
28895 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28898 case X86ISD::VSHLI:
28899 case X86ISD::VSRLI: {
28900 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
// Out-of-range immediate shift yields all-zero lanes.
28901 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28902 Known.setAllZero();
28906 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28907 unsigned ShAmt = ShiftImm->getZExtValue();
28908 if (Opc == X86ISD::VSHLI) {
28909 Known.Zero <<= ShAmt;
28910 Known.One <<= ShAmt;
28911 // Low bits are known zero.
28912 Known.Zero.setLowBits(ShAmt);
28914 Known.Zero.lshrInPlace(ShAmt);
28915 Known.One.lshrInPlace(ShAmt);
28916 // High bits are known zero.
28917 Known.Zero.setHighBits(ShAmt);
28922 case X86ISD::PACKUS: {
28923 // PACKUS is just a truncation if the upper half is zero.
28924 // TODO: Add DemandedElts support.
// Intersect knowledge from both inputs; only usable if the upper half
// is provably zero (checked via countMinLeadingZeros below).
28926 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
28927 DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
28928 Known.One &= Known2.One;
28929 Known.Zero &= Known2.Zero;
28930 if (Known.countMinLeadingZeros() < BitWidth)
28932 Known = Known.trunc(BitWidth);
28935 case X86ISD::VZEXT: {
28936 // TODO: Add DemandedElts support.
28937 SDValue N0 = Op.getOperand(0);
28938 unsigned NumElts = VT.getVectorNumElements();
28940 EVT SrcVT = N0.getValueType();
28941 unsigned InNumElts = SrcVT.getVectorNumElements();
28942 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28943 assert(InNumElts >= NumElts && "Illegal VZEXT input");
// Query the low source elements at source width, then zero-extend the
// result: bits above the input width are known zero.
28945 Known = KnownBits(InBitWidth);
28946 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28947 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28948 Known = Known.zext(BitWidth);
28949 Known.Zero.setBitsFrom(InBitWidth);
28952 case X86ISD::CMOV: {
28953 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28954 // If we don't know any bits, early out.
28955 if (Known.isUnknown())
28958 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28960 // Only known if known in both the LHS and RHS.
28961 Known.One &= Known2.One;
28962 Known.Zero &= Known2.Zero;
28965 case X86ISD::UDIVREM8_ZEXT_HREG:
28966 // TODO: Support more than just the zero extended bits?
28967 if (Op.getResNo() != 1)
28969 // The remainder is zero extended.
28970 Known.Zero.setBitsFrom(8)
28974 // Handle target shuffles.
28975 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
28976 if (isTargetShuffle(Opc)) {
28978 SmallVector<int, 64> Mask;
28979 SmallVector<SDValue, 2> Ops;
28980 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
28982 unsigned NumOps = Ops.size();
28983 unsigned NumElts = VT.getVectorNumElements();
28984 if (Mask.size() == NumElts) {
// Per shuffle operand, record which of its elements are demanded,
// then intersect the known bits across all demanded sources.
28985 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
28986 Known.Zero.setAllBits(); Known.One.setAllBits();
28987 for (unsigned i = 0; i != NumElts; ++i) {
28988 if (!DemandedElts[i])
28991 if (M == SM_SentinelUndef) {
28992 // For UNDEF elements, we don't know anything about the common state
28993 // of the shuffle result.
28996 } else if (M == SM_SentinelZero) {
28997 Known.One.clearAllBits();
29000 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
29001 "Shuffle index out of range");
29003 unsigned OpIdx = (unsigned)M / NumElts;
29004 unsigned EltIdx = (unsigned)M % NumElts;
29005 if (Ops[OpIdx].getValueType() != VT) {
29006 // TODO - handle target shuffle ops with different value types.
29010 DemandedOps[OpIdx].setBit(EltIdx);
29012 // Known bits are the values that are shared by every demanded element.
29013 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
29014 if (!DemandedOps[i])
29017 DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
29018 Known.One &= Known2.One;
29019 Known.Zero &= Known2.Zero;
// Compute the minimum number of sign bits for X86-specific SDNodes.
// Handles all-ones/zero producers (SETCC_CARRY, vector compares),
// extension/truncation nodes (VSEXT, VTRUNC, PACKSS), immediate shifts
// (VSHLI/VSRAI), CMOV, and the sign-extended div/rem remainder.
// NOTE(review): this dump elides some interior lines of the original body.
29026 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
29027 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
29028 unsigned Depth) const {
29029 unsigned VTBits = Op.getScalarValueSizeInBits();
29030 unsigned Opcode = Op.getOpcode();
29032 case X86ISD::SETCC_CARRY:
29033 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
29036 case X86ISD::VSEXT: {
29037 // TODO: Add DemandedElts support.
// Sign extension adds (dst width - src width) sign bits on top of the
// source's own count.
29038 SDValue Src = Op.getOperand(0);
29039 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
29040 Tmp += VTBits - Src.getScalarValueSizeInBits();
29044 case X86ISD::VTRUNC: {
29045 // TODO: Add DemandedElts support.
// Truncation keeps whatever sign bits survive the dropped top bits.
29046 SDValue Src = Op.getOperand(0);
29047 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
29048 assert(VTBits < NumSrcBits && "Illegal truncation input type");
29049 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
29050 if (Tmp > (NumSrcBits - VTBits))
29051 return Tmp - (NumSrcBits - VTBits);
29055 case X86ISD::PACKSS: {
29056 // PACKSS is just a truncation if the sign bits extend to the packed size.
29057 // TODO: Add DemandedElts support.
29058 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
29059 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
29060 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
29061 unsigned Tmp = std::min(Tmp0, Tmp1);
29062 if (Tmp > (SrcBits - VTBits))
29063 return Tmp - (SrcBits - VTBits);
29067 case X86ISD::VSHLI: {
29068 SDValue Src = Op.getOperand(0);
29069 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
29070 if (ShiftVal.uge(VTBits))
29071 return VTBits; // Shifted all bits out --> zero.
29072 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
29073 if (ShiftVal.uge(Tmp))
29074 return 1; // Shifted all sign bits out --> unknown.
29075 return Tmp - ShiftVal.getZExtValue();
29078 case X86ISD::VSRAI: {
29079 SDValue Src = Op.getOperand(0);
29080 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
// Shifting by >= VTBits-1 replicates the sign bit into every position.
29081 if (ShiftVal.uge(VTBits - 1))
29082 return VTBits; // Sign splat.
29083 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
29085 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
// Vector compares produce all-zeros or all-ones per lane.
29088 case X86ISD::PCMPGT:
29089 case X86ISD::PCMPEQ:
29091 case X86ISD::VPCOM:
29092 case X86ISD::VPCOMU:
29093 // Vector compares return zero/all-bits result values.
29096 case X86ISD::CMOV: {
// A select's sign-bit count is the minimum over both arms.
29097 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
29098 if (Tmp0 == 1) return 1; // Early out.
29099 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
29100 return std::min(Tmp0, Tmp1);
29102 case X86ISD::SDIVREM8_SEXT_HREG:
29103 // TODO: Support more than just the sign extended bits?
29104 if (Op.getResNo() != 1)
29106 // The remainder is sign extended.
// Strip an X86ISD::Wrapper/WrapperRIP node to expose the underlying
// address operand (otherwise the node is returned unchanged — tail of
// the body is elided in this dump).
29114 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
29115 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
29116 return N->getOperand(0);
29120 /// Returns true (and the GlobalValue and the offset) if the node is a
29121 /// GlobalAddress + offset.
// Peeks through X86ISD::Wrapper to recognize a wrapped GlobalAddressSDNode;
// anything else is deferred to the generic TargetLowering implementation.
29122 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
29123 const GlobalValue* &GA,
29124 int64_t &Offset) const {
29125 if (N->getOpcode() == X86ISD::Wrapper) {
29126 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
// Out-params: the global itself and its constant offset.
29127 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
29128 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
29132 return TargetLowering::isGAPlusOffset(N, GA, Offset);
29135 // Attempt to match a combined shuffle mask against supported unary shuffle
29137 // TODO: Investigate sharing more of this with shuffle lowering.
// On success this sets Shuffle (the x86 opcode to emit), SrcVT (the type the
// input should be bitcast to) and DstVT (the result type), and returns true.
// V1 is passed by reference and may be rewritten (e.g. extracted down to a
// narrower subvector) to suit the matched instruction.
29138 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29139 bool AllowFloatDomain, bool AllowIntDomain,
29140 SDValue &V1, const SDLoc &DL,
29142 const X86Subtarget &Subtarget,
29143 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
29144 unsigned NumMaskElts = Mask.size();
29145 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
29147 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
29148 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
29149 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
29150 Shuffle = X86ISD::VZEXT_MOVL;
// Without SSE2 only the f32 domain form (MOVSS) exists.
29151 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
29155 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
29156 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
29157 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
29158 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
// Try each power-of-2 extension scale that keeps the result element
// within 64 bits.
29159 unsigned MaxScale = 64 / MaskEltSize;
29160 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
29162 unsigned NumDstElts = NumMaskElts / Scale;
// A zero-extension match: element i of the source, followed by
// Scale-1 undef/zero slots.
29163 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
29164 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
29165 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
// Source must be at least 128 bits wide.
29168 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
29169 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
29170 MVT::getIntegerVT(MaskEltSize);
29171 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
// If the source is narrower than the mask type, extract the low
// subvector and use the VZEXT form instead of the in-reg extend.
29173 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
29174 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
29175 Shuffle = unsigned(X86ISD::VZEXT);
29177 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
29179 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
29180 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
29186 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
29187 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
29188 isUndefOrEqual(Mask[0], 0) &&
29189 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
29190 Shuffle = X86ISD::VZEXT_MOVL;
29191 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
29195 // Check if we have SSE3 which will let us use MOVDDUP etc. The
29196 // instructions are no slower than UNPCKLPD but has the option to
29197 // fold the input operand into even an unaligned memory load.
29198 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
29199 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
29200 Shuffle = X86ISD::MOVDDUP;
29201 SrcVT = DstVT = MVT::v2f64;
29204 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29205 Shuffle = X86ISD::MOVSLDUP;
29206 SrcVT = DstVT = MVT::v4f32;
29209 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
29210 Shuffle = X86ISD::MOVSHDUP;
29211 SrcVT = DstVT = MVT::v4f32;
// 256-bit versions of the DUP patterns above, repeated per 128-bit lane.
29216 if (MaskVT.is256BitVector() && AllowFloatDomain) {
29217 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
29218 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29219 Shuffle = X86ISD::MOVDDUP;
29220 SrcVT = DstVT = MVT::v4f64;
29223 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29224 Shuffle = X86ISD::MOVSLDUP;
29225 SrcVT = DstVT = MVT::v8f32;
29228 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
29229 Shuffle = X86ISD::MOVSHDUP;
29230 SrcVT = DstVT = MVT::v8f32;
// 512-bit versions of the same even/odd duplication patterns.
29235 if (MaskVT.is512BitVector() && AllowFloatDomain) {
29236 assert(Subtarget.hasAVX512() &&
29237 "AVX512 required for 512-bit vector shuffles");
29238 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29239 Shuffle = X86ISD::MOVDDUP;
29240 SrcVT = DstVT = MVT::v8f64;
29243 if (isTargetShuffleEquivalent(
29244 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
29245 Shuffle = X86ISD::MOVSLDUP;
29246 SrcVT = DstVT = MVT::v16f32;
29249 if (isTargetShuffleEquivalent(
29250 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
29251 Shuffle = X86ISD::MOVSHDUP;
29252 SrcVT = DstVT = MVT::v16f32;
29257 // Attempt to match against broadcast-from-vector.
29258 if (Subtarget.hasAVX2()) {
// An all-zeros mask means every lane takes element 0 - a splat.
29259 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
29260 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
29261 SrcVT = DstVT = MaskVT;
29262 Shuffle = X86ISD::VBROADCAST;
29270 // Attempt to match a combined shuffle mask against supported unary immediate
29271 // permute instructions.
29272 // TODO: Investigate sharing more of this with shuffle lowering.
// On success this sets Shuffle (x86 opcode), ShuffleVT (the type to perform
// the shuffle in) and PermuteImm (the instruction's immediate operand), and
// returns true. Zeroable marks mask elements known to be zero.
29273 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29274 const APInt &Zeroable,
29275 bool AllowFloatDomain,
29276 bool AllowIntDomain,
29277 const X86Subtarget &Subtarget,
29278 unsigned &Shuffle, MVT &ShuffleVT,
29279 unsigned &PermuteImm) {
29280 unsigned NumMaskElts = Mask.size();
29281 unsigned InputSizeInBits = MaskVT.getSizeInBits();
29282 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
29283 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
29285 bool ContainsZeros =
29286 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29288 // Handle VPERMI/VPERMILPD vXf64/vXi64 patterns.
29289 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
29290 // Check for lane crossing permutes.
29291 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
29292 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
29293 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
29294 Shuffle = X86ISD::VPERMI;
29295 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
29296 PermuteImm = getV4X86ShuffleImm(Mask);
// 512-bit PERMPD/PERMQ only permutes per 256-bit lane, so require a
// mask that repeats across the two 256-bit halves.
29299 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
29300 SmallVector<int, 4> RepeatedMask;
29301 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
29302 Shuffle = X86ISD::VPERMI;
29303 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
29304 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
29308 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
29309 // VPERMILPD can permute with a non-repeating shuffle.
29310 Shuffle = X86ISD::VPERMILPI;
29311 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
// Build the per-lane immediate: bit i selects the odd element of the
// i'th 128-bit lane when set.
29313 for (int i = 0, e = Mask.size(); i != e; ++i) {
29315 if (M == SM_SentinelUndef)
29317 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
29318 PermuteImm |= (M & 1) << i;
29324 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
29325 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
29326 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
29327 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
29328 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
29329 SmallVector<int, 4> RepeatedMask;
29330 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29331 // Narrow the repeated mask to create 32-bit element permutes.
29332 SmallVector<int, 4> WordMask = RepeatedMask;
29333 if (MaskScalarSizeInBits == 64)
29334 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
29336 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
29337 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
29338 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
29339 PermuteImm = getV4X86ShuffleImm(WordMask);
29344 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
29345 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
29346 SmallVector<int, 4> RepeatedMask;
29347 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29348 ArrayRef<int> LoMask(Mask.data() + 0, 4);
29349 ArrayRef<int> HiMask(Mask.data() + 4, 4);
29351 // PSHUFLW: permute lower 4 elements only.
29352 if (isUndefOrInRange(LoMask, 0, 4) &&
29353 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
29354 Shuffle = X86ISD::PSHUFLW;
29355 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29356 PermuteImm = getV4X86ShuffleImm(LoMask);
29360 // PSHUFHW: permute upper 4 elements only.
29361 if (isUndefOrInRange(HiMask, 4, 8) &&
29362 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
29363 // Offset the HiMask so that we can create the shuffle immediate.
29364 int OffsetHiMask[4];
29365 for (int i = 0; i != 4; ++i)
29366 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
29368 Shuffle = X86ISD::PSHUFHW;
29369 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29370 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
29376 // Attempt to match against byte/bit shifts.
29377 // FIXME: Add 512-bit support.
29378 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29379 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
// matchVectorShuffleAsShift fills Shuffle/ShuffleVT itself; a positive
// return value is the shift amount to encode as the immediate.
29380 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
29381 MaskScalarSizeInBits, Mask,
29382 0, Zeroable, Subtarget);
29383 if (0 < ShiftAmt) {
29384 PermuteImm = (unsigned)ShiftAmt;
29392 // Attempt to match a combined unary shuffle mask against supported binary
29393 // shuffle instructions.
29394 // TODO: Investigate sharing more of this with shuffle lowering.
// On success this sets Shuffle/SrcVT/DstVT and returns true. V1/V2 are
// passed by reference and may be updated (here, or inside the PACK/UNPCK
// match helpers) to match the chosen instruction's operands.
29395 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29396 bool AllowFloatDomain, bool AllowIntDomain,
29397 SDValue &V1, SDValue &V2, const SDLoc &DL,
29399 const X86Subtarget &Subtarget,
29400 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
29402 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29404 if (MaskVT.is128BitVector()) {
// {0,0}: duplicate the low element - UNPCKL (SSE2) or MOVLHPS (SSE1).
29405 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
29407 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
29408 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
29409 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
// {1,1}: duplicate the high element - UNPCKH (SSE2) or MOVHLPS (SSE1).
29412 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
29414 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
29415 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
// {0,3}: low element of V1 with high element of V2 - MOVSD.
29418 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
29419 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29421 Shuffle = X86ISD::MOVSD;
29422 SrcVT = DstVT = MVT::v2f64;
// {4,1,2,3}: first element from V2, rest from V1 - MOVSS.
29425 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
29426 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29427 Shuffle = X86ISD::MOVSS;
29428 SrcVT = DstVT = MVT::v4f32;
29433 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
29434 // TODO add support for 256/512-bit types.
29435 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
29436 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
29443 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
29444 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
29445 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29446 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
29447 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
29448 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
29449 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
29451 SrcVT = DstVT = MaskVT;
// AVX1 only has float-domain 256-bit unpacks, so switch to f32/f64.
29452 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
29453 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
// Attempt to match a combined shuffle mask against supported binary shuffle
// instructions that take an immediate (PALIGNR, BLENDI, INSERTPS, SHUFPD,
// SHUFPS). On success sets Shuffle, ShuffleVT and PermuteImm and returns
// true; V1/V2 may be rewritten (e.g. forced to zero for a zeroing blend).
29461 static bool matchBinaryPermuteVectorShuffle(
29462 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
29463 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
29464 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
29465 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
29466 unsigned NumMaskElts = Mask.size();
29467 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29469 // Attempt to match against PALIGNR byte rotate.
29470 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29471 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29472 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
29473 if (0 < ByteRotation) {
29474 Shuffle = X86ISD::PALIGNR;
29475 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
29476 PermuteImm = ByteRotation;
29481 // Attempt to combine to X86ISD::BLENDI.
// BLENDI immediates are 8 bits, so limit to <= 8 elements (v16i16 is the
// special PBLENDW case handled below).
29482 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
29483 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
29484 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
29485 uint64_t BlendMask = 0;
29486 bool ForceV1Zero = false, ForceV2Zero = false;
29487 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
29488 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
29490 if (MaskVT == MVT::v16i16) {
29491 // We can only use v16i16 PBLENDW if the lanes are repeated.
29492 SmallVector<int, 8> RepeatedMask;
29493 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
29495 assert(RepeatedMask.size() == 8 &&
29496 "Repeated mask size doesn't match!");
// Immediate bit i selects V2's element i (per 128-bit lane).
29498 for (int i = 0; i < 8; ++i)
29499 if (RepeatedMask[i] >= 8)
29500 PermuteImm |= 1 << i;
29501 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29502 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29503 Shuffle = X86ISD::BLENDI;
29504 ShuffleVT = MaskVT;
29508 // Determine a type compatible with X86ISD::BLENDI.
29509 ShuffleVT = MaskVT;
29510 if (Subtarget.hasAVX2()) {
29511 if (ShuffleVT == MVT::v4i64)
29512 ShuffleVT = MVT::v8i32;
29513 else if (ShuffleVT == MVT::v2i64)
29514 ShuffleVT = MVT::v4i32;
// Pre-AVX2 fallback: retype integer blends to PBLENDW/float forms.
29516 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
29517 ShuffleVT = MVT::v8i16;
29518 else if (ShuffleVT == MVT::v4i64)
29519 ShuffleVT = MVT::v4f64;
29520 else if (ShuffleVT == MVT::v8i32)
29521 ShuffleVT = MVT::v8f32;
// Scale the blend mask if the chosen blend type has narrower elements
// than the original mask.
29524 if (!ShuffleVT.isFloatingPoint()) {
29525 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
29527 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
29528 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
29529 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
29532 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29533 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29534 PermuteImm = (unsigned)BlendMask;
29535 Shuffle = X86ISD::BLENDI;
29541 // Attempt to combine to INSERTPS.
29542 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
29543 MaskVT.is128BitVector()) {
// Only bother when at least one element is known zeroable.
29544 if (Zeroable.getBoolValue() &&
29545 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
29546 Shuffle = X86ISD::INSERTPS;
29547 ShuffleVT = MVT::v4f32;
29552 // Attempt to combine to SHUFPD.
29553 if (AllowFloatDomain && EltSizeInBits == 64 &&
29554 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29555 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29556 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29557 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
29558 Shuffle = X86ISD::SHUFP;
29559 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
29564 // Attempt to combine to SHUFPS.
29565 if (AllowFloatDomain && EltSizeInBits == 32 &&
29566 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
29567 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29568 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29569 SmallVector<int, 4> RepeatedMask;
29570 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
29571 // Match each half of the repeated mask, to determine if its just
29572 // referencing one of the vectors, is zeroable or entirely undef.
// Returns the source vector for a 2-element half of the repeated mask
// and writes its SHUFPS selectors into S0/S1 (-1 = undef).
29573 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
29574 int M0 = RepeatedMask[Offset];
29575 int M1 = RepeatedMask[Offset + 1];
29577 if (isUndefInRange(RepeatedMask, Offset, 2)) {
29578 return DAG.getUNDEF(MaskVT);
29579 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
29580 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
29581 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
29582 return getZeroVector(MaskVT, Subtarget, DAG, DL);
29583 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
29584 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29585 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29587 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
29588 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29589 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
// SHUFPS takes its low two result elements from Lo and the high two
// from Hi.
29596 int ShufMask[4] = {-1, -1, -1, -1};
29597 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
29598 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
29603 Shuffle = X86ISD::SHUFP;
29604 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
29605 PermuteImm = getV4X86ShuffleImm(ShufMask);
29614 /// Combine an arbitrary chain of shuffles into a single instruction if
29617 /// This is the leaf of the recursive combine below. When we have found some
29618 /// chain of single-use x86 shuffle instructions and accumulated the combined
29619 /// shuffle mask represented by them, this will try to pattern match that mask
29620 /// into either a single instruction if there is a special purpose instruction
29621 /// for this operation, or into a PSHUFB instruction which is a fully general
29622 /// instruction but should only be used to replace chains over a certain depth.
29623 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
29624 ArrayRef<int> BaseMask, int Depth,
29625 bool HasVariableMask, SelectionDAG &DAG,
29626 const X86Subtarget &Subtarget) {
29627 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
29628 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
29629 "Unexpected number of shuffle inputs!");
29631 // Find the inputs that enter the chain. Note that multiple uses are OK
29632 // here, we're not going to remove the operands we find.
29633 bool UnaryShuffle = (Inputs.size() == 1);
29634 SDValue V1 = peekThroughBitcasts(Inputs[0]);
29635 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
29636 : peekThroughBitcasts(Inputs[1]));
29638 MVT VT1 = V1.getSimpleValueType();
29639 MVT VT2 = V2.getSimpleValueType();
29640 MVT RootVT = Root.getSimpleValueType();
29641 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
29642 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
29643 "Vector size mismatch");
29648 unsigned NumBaseMaskElts = BaseMask.size();
29649 if (NumBaseMaskElts == 1) {
29650 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
29651 return DAG.getBitcast(RootVT, V1);
29654 unsigned RootSizeInBits = RootVT.getSizeInBits();
29655 unsigned NumRootElts = RootVT.getVectorNumElements();
29656 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
29657 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
29658 (RootVT.isFloatingPoint() && Depth >= 2) ||
29659 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
29661 // Don't combine if we are a AVX512/EVEX target and the mask element size
29662 // is different from the root element size - this would prevent writemasks
29663 // from being reused.
29664 // TODO - this currently prevents all lane shuffles from occurring.
29665 // TODO - check for writemasks usage instead of always preventing combining.
29666 // TODO - attempt to narrow Mask back to writemask size.
29667 bool IsEVEXShuffle =
29668 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
29670 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
29672 // Handle 128-bit lane shuffles of 256-bit vectors.
29673 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
29674 // we need to use the zeroing feature.
29675 // TODO - this should support binary shuffles.
29676 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
29677 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
29678 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
29679 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
29680 return SDValue(); // Nothing to do!
29681 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
29682 unsigned PermMask = 0;
29683 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
29684 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
29686 Res = DAG.getBitcast(ShuffleVT, V1);
29687 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
29688 DAG.getUNDEF(ShuffleVT),
29689 DAG.getConstant(PermMask, DL, MVT::i8));
29690 return DAG.getBitcast(RootVT, Res);
29693 // For masks that have been widened to 128-bit elements or more,
29694 // narrow back down to 64-bit elements.
29695 SmallVector<int, 64> Mask;
29696 if (BaseMaskEltSizeInBits > 64) {
29697 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
29698 int MaskScale = BaseMaskEltSizeInBits / 64;
29699 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
29701 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
29704 unsigned NumMaskElts = Mask.size();
29705 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
29707 // Determine the effective mask value type.
29708 FloatDomain &= (32 <= MaskEltSizeInBits);
29709 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
29710 : MVT::getIntegerVT(MaskEltSizeInBits);
29711 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
29713 // Only allow legal mask types.
29714 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
29717 // Attempt to match the mask against known shuffle patterns.
29718 MVT ShuffleSrcVT, ShuffleVT;
29719 unsigned Shuffle, PermuteImm;
29721 // Which shuffle domains are permitted?
29722 // Permit domain crossing at higher combine depths.
29723 bool AllowFloatDomain = FloatDomain || (Depth > 3);
29724 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
29725 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
29727 // Determine zeroable mask elements.
29728 APInt Zeroable(NumMaskElts, 0);
29729 for (unsigned i = 0; i != NumMaskElts; ++i)
29730 if (isUndefOrZero(Mask[i]))
29731 Zeroable.setBit(i);
29733 if (UnaryShuffle) {
29734 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
29735 // directly if we don't shuffle the lower element and we shuffle the upper
29736 // (zero) elements within themselves.
29737 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
29738 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
29739 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
29740 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
29741 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
29742 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
29743 return DAG.getBitcast(RootVT, V1);
29747 SDValue NewV1 = V1; // Save operand in case early exit happens.
29748 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29749 NewV1, DL, DAG, Subtarget, Shuffle,
29750 ShuffleSrcVT, ShuffleVT) &&
29751 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29752 if (Depth == 1 && Root.getOpcode() == Shuffle)
29753 return SDValue(); // Nothing to do!
29754 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
29755 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
29756 return DAG.getBitcast(RootVT, Res);
29759 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
29760 AllowIntDomain, Subtarget, Shuffle,
29761 ShuffleVT, PermuteImm) &&
29762 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29763 if (Depth == 1 && Root.getOpcode() == Shuffle)
29764 return SDValue(); // Nothing to do!
29765 Res = DAG.getBitcast(ShuffleVT, V1);
29766 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
29767 DAG.getConstant(PermuteImm, DL, MVT::i8));
29768 return DAG.getBitcast(RootVT, Res);
29772 SDValue NewV1 = V1; // Save operands in case early exit happens.
29773 SDValue NewV2 = V2;
29774 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29775 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
29776 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
29777 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29778 if (Depth == 1 && Root.getOpcode() == Shuffle)
29779 return SDValue(); // Nothing to do!
29780 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
29781 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
29782 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
29783 return DAG.getBitcast(RootVT, Res);
29786 NewV1 = V1; // Save operands in case early exit happens.
29788 if (matchBinaryPermuteVectorShuffle(
29789 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
29790 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
29791 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29792 if (Depth == 1 && Root.getOpcode() == Shuffle)
29793 return SDValue(); // Nothing to do!
29794 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
29795 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
29796 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
29797 DAG.getConstant(PermuteImm, DL, MVT::i8));
29798 return DAG.getBitcast(RootVT, Res);
29801 // Typically from here on, we need an integer version of MaskVT.
29802 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
29803 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
29805 // Annoyingly, SSE4A instructions don't map into the above match helpers.
29806 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
29807 uint64_t BitLen, BitIdx;
29808 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
29810 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
29811 return SDValue(); // Nothing to do!
29812 V1 = DAG.getBitcast(IntMaskVT, V1);
29813 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
29814 DAG.getConstant(BitLen, DL, MVT::i8),
29815 DAG.getConstant(BitIdx, DL, MVT::i8));
29816 return DAG.getBitcast(RootVT, Res);
29819 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
29820 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
29821 return SDValue(); // Nothing to do!
29822 V1 = DAG.getBitcast(IntMaskVT, V1);
29823 V2 = DAG.getBitcast(IntMaskVT, V2);
29824 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
29825 DAG.getConstant(BitLen, DL, MVT::i8),
29826 DAG.getConstant(BitIdx, DL, MVT::i8));
29827 return DAG.getBitcast(RootVT, Res);
29831 // Don't try to re-form single instruction chains under any circumstances now
29832 // that we've done encoding canonicalization for them.
29836 // Depth threshold above which we can efficiently use variable mask shuffles.
29837 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
29838 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
29840 bool MaskContainsZeros =
29841 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29843 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
29844 // If we have a single input lane-crossing shuffle then lower to VPERMV.
29845 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29846 ((Subtarget.hasAVX2() &&
29847 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29848 (Subtarget.hasAVX512() &&
29849 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29850 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29851 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29852 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29853 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29854 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29855 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29856 Res = DAG.getBitcast(MaskVT, V1);
29857 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
29858 return DAG.getBitcast(RootVT, Res);
29861 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
29862 // vector as the second source.
29863 if (UnaryShuffle && AllowVariableMask &&
29864 ((Subtarget.hasAVX512() &&
29865 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29866 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29867 (Subtarget.hasVLX() &&
29868 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29869 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29870 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29871 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29872 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29873 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29874 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
29875 for (unsigned i = 0; i != NumMaskElts; ++i)
29876 if (Mask[i] == SM_SentinelZero)
29877 Mask[i] = NumMaskElts + i;
29879 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29880 Res = DAG.getBitcast(MaskVT, V1);
29881 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
29882 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
29883 return DAG.getBitcast(RootVT, Res);
29886 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
29887 if (AllowVariableMask && !MaskContainsZeros &&
29888 ((Subtarget.hasAVX512() &&
29889 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29890 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29891 (Subtarget.hasVLX() &&
29892 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29893 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29894 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29895 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29896 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29897 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29898 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29899 V1 = DAG.getBitcast(MaskVT, V1);
29900 V2 = DAG.getBitcast(MaskVT, V2);
29901 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29902 return DAG.getBitcast(RootVT, Res);
29907 // See if we can combine a single input shuffle with zeros to a bit-mask,
29908 // which is much simpler than any shuffle.
29909 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29910 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29911 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29912 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29913 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29914 APInt UndefElts(NumMaskElts, 0);
29915 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29916 for (unsigned i = 0; i != NumMaskElts; ++i) {
29918 if (M == SM_SentinelUndef) {
29919 UndefElts.setBit(i);
29922 if (M == SM_SentinelZero)
29924 EltBits[i] = AllOnes;
29926 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29927 Res = DAG.getBitcast(MaskVT, V1);
29928 unsigned AndOpcode =
29929 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29930 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29931 return DAG.getBitcast(RootVT, Res);
29934 // If we have a single input shuffle with different shuffle patterns in the
29935 // the 128-bit lanes use the variable mask to VPERMILPS.
29936 // TODO Combine other mask types at higher depths.
29937 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29938 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29939 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29940 SmallVector<SDValue, 16> VPermIdx;
29941 for (int M : Mask) {
29943 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29944 VPermIdx.push_back(Idx);
29946 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29947 Res = DAG.getBitcast(MaskVT, V1);
29948 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29949 return DAG.getBitcast(RootVT, Res);
29952 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29953 // to VPERMIL2PD/VPERMIL2PS.
29954 if (AllowVariableMask && Subtarget.hasXOP() &&
29955 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29956 MaskVT == MVT::v8f32)) {
29957 // VPERMIL2 Operation.
29958 // Bits[3] - Match Bit.
29959 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29960 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
29961 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29962 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29963 SmallVector<int, 8> VPerm2Idx;
29964 unsigned M2ZImm = 0;
29965 for (int M : Mask) {
29966 if (M == SM_SentinelUndef) {
29967 VPerm2Idx.push_back(-1);
29970 if (M == SM_SentinelZero) {
29972 VPerm2Idx.push_back(8);
29975 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29976 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29977 VPerm2Idx.push_back(Index);
29979 V1 = DAG.getBitcast(MaskVT, V1);
29980 V2 = DAG.getBitcast(MaskVT, V2);
29981 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29982 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29983 DAG.getConstant(M2ZImm, DL, MVT::i8));
29984 return DAG.getBitcast(RootVT, Res);
29987 // If we have 3 or more shuffle instructions or a chain involving a variable
29988 // mask, we can replace them with a single PSHUFB instruction profitably.
29989 // Intel's manuals suggest only using PSHUFB if doing so replacing 5
29990 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29991 // more aggressive.
29992 if (UnaryShuffle && AllowVariableMask &&
29993 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29994 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29995 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29996 SmallVector<SDValue, 16> PSHUFBMask;
29997 int NumBytes = RootVT.getSizeInBits() / 8;
29998 int Ratio = NumBytes / NumMaskElts;
29999 for (int i = 0; i < NumBytes; ++i) {
30000 int M = Mask[i / Ratio];
30001 if (M == SM_SentinelUndef) {
30002 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
30005 if (M == SM_SentinelZero) {
30006 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
30009 M = Ratio * M + i % Ratio;
30010 assert((M / 16) == (i / 16) && "Lane crossing detected");
30011 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
30013 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
30014 Res = DAG.getBitcast(ByteVT, V1);
30015 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
30016 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
30017 return DAG.getBitcast(RootVT, Res);
30020 // With XOP, if we have a 128-bit binary input shuffle we can always combine
30021 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
30022 // slower than PSHUFB on targets that support both.
30023 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
30024 // VPPERM Mask Operation
30025 // Bits[4:0] - Byte Index (0 - 31)
30026 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
30027 SmallVector<SDValue, 16> VPPERMMask;
30029 int Ratio = NumBytes / NumMaskElts;
30030 for (int i = 0; i < NumBytes; ++i) {
30031 int M = Mask[i / Ratio];
30032 if (M == SM_SentinelUndef) {
30033 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
30036 if (M == SM_SentinelZero) {
30037 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
30040 M = Ratio * M + i % Ratio;
30041 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
30043 MVT ByteVT = MVT::v16i8;
30044 V1 = DAG.getBitcast(ByteVT, V1);
30045 V2 = DAG.getBitcast(ByteVT, V2);
30046 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
30047 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
30048 return DAG.getBitcast(RootVT, Res);
30051 // Failed to find any combines.
30055 // Attempt to constant fold all of the constant source ops.
30056 // Returns true if the entire shuffle is folded to a constant.
30057 // TODO: Extend this to merge multiple constant Ops and update the mask.
// NOTE(review): this chunk is an elided dump -- several original lines are
// missing between the numbered lines below (e.g. the SelectionDAG &DAG
// parameter, `int M = Mask[i];`, `continue;` lines and closing braces).
// Comments describe only what the visible lines establish.
30058 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
30059 ArrayRef<int> Mask, SDValue Root,
30060 bool HasVariableMask,
30062 const X86Subtarget &Subtarget) {
// The root value type defines the overall width; each mask element covers
// SizeInBits / NumMaskElts bits of it.
30063 MVT VT = Root.getSimpleValueType();
30065 unsigned SizeInBits = VT.getSizeInBits();
30066 unsigned NumMaskElts = Mask.size();
30067 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
30068 unsigned NumOps = Ops.size();
30070 // Extract constant bits from each source op.
30071 bool OneUseConstantOp = false;
30072 SmallVector<APInt, 16> UndefEltsOps(NumOps);
30073 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
30074 for (unsigned i = 0; i != NumOps; ++i) {
30075 SDValue SrcOp = Ops[i];
// Track whether any source constant has a single use; used below to gate
// the fold against constant-pool bloat.
30076 OneUseConstantOp |= SrcOp.hasOneUse();
// Bail out of the whole fold if any source op is not constant (the failure
// return on this condition is in the elided lines).
30077 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
30082 // Only fold if at least one of the constants is only used once or
30083 // the combined shuffle has included a variable mask shuffle, this
30084 // is to avoid constant pool bloat.
30085 if (!OneUseConstantOp && !HasVariableMask)
30088 // Shuffle the constant bits according to the mask.
// Per-result-element classification: each lane ends up undef, zero, or a
// known constant; the assert after the loop checks the three sets cover all.
30089 APInt UndefElts(NumMaskElts, 0);
30090 APInt ZeroElts(NumMaskElts, 0);
30091 APInt ConstantElts(NumMaskElts, 0);
30092 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
30093 APInt::getNullValue(MaskSizeInBits));
30094 for (unsigned i = 0; i != NumMaskElts; ++i) {
// `M` is the mask element for lane i (declaration elided in this dump).
30096 if (M == SM_SentinelUndef) {
30097 UndefElts.setBit(i);
30099 } else if (M == SM_SentinelZero) {
30100 ZeroElts.setBit(i);
30103 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
// Non-sentinel indices are linearized across all ops: decompose into
// (source op, element within that op).
30105 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
30106 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
30108 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
30109 if (SrcUndefElts[SrcMaskIdx]) {
30110 UndefElts.setBit(i);
30114 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
30115 APInt &Bits = SrcEltBits[SrcMaskIdx];
// A source constant whose bits are all zero is recorded as a zero lane
// (the null-check between these lines is elided).
30117 ZeroElts.setBit(i);
30121 ConstantElts.setBit(i);
30122 ConstantBitData[i] = Bits;
30124 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
30126 // Create the constant data.
// Use a float scalar type for FP roots at legal FP widths so the constant
// stays in the same domain; otherwise fall back to an integer scalar.
30128 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
30129 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
30131 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
30133 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
// Materialize the shuffled constant and bitcast back to the root type.
30136 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
30137 return DAG.getBitcast(VT, CstOp);
30140 /// Fully generic combining of x86 shuffle instructions.
30142 /// This should be the last combine run over the x86 shuffle instructions. Once
30143 /// they have been fully optimized, this will recursively consider all chains
30144 /// of single-use shuffle instructions, build a generic model of the cumulative
30145 /// shuffle operation, and check for simpler instructions which implement this
30146 /// operation. We use this primarily for two purposes:
30148 /// 1) Collapse generic shuffles to specialized single instructions when
30149 /// equivalent. In most cases, this is just an encoding size win, but
30150 /// sometimes we will collapse multiple generic shuffles into a single
30151 /// special-purpose shuffle.
30152 /// 2) Look for sequences of shuffle instructions with 3 or more total
30153 /// instructions, and replace them with the slightly more expensive SSSE3
30154 /// PSHUFB instruction if available. We do this as the last combining step
30155 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
30156 /// a suitable short sequence of other instructions. The PSHUFB will either
30157 /// use a register or have to read from memory and so is slightly (but only
30158 /// slightly) more expensive than the other shuffle instructions.
30160 /// Because this is inherently a quadratic operation (for each shuffle in
30161 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
30162 /// This should never be an issue in practice as the shuffle lowering doesn't
30163 /// produce sequences of more than 8 instructions.
30165 /// FIXME: We will currently miss some cases where the redundant shuffling
30166 /// would simplify under the threshold for PSHUFB formation because of
30167 /// combine-ordering. To fix this, we should do the redundant instruction
30168 /// combining in this recursive walk.
// NOTE(review): elided dump -- some original lines (early `return SDValue()`s,
// a few loop-body statements, closing braces) are missing between the
// numbered lines below.
30169 static SDValue combineX86ShufflesRecursively(
30170 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
30171 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
30172 bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
30173 // Bound the depth of our recursive combine because this is ultimately
30174 // quadratic in nature.
30175 const unsigned MaxRecursionDepth = 8;
30176 if (Depth > MaxRecursionDepth)
30179 // Directly rip through bitcasts to find the underlying operand.
30180 SDValue Op = SrcOps[SrcOpIndex];
30181 Op = peekThroughOneUseBitcasts(Op);
30183 MVT VT = Op.getSimpleValueType();
30184 if (!VT.isVector())
30185 return SDValue(); // Bail if we hit a non-vector.
30187 assert(Root.getSimpleValueType().isVector() &&
30188 "Shuffles operate on vector types!");
30189 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
30190 "Can only combine shuffles of the same vector register size.");
30192 // Extract target shuffle mask and resolve sentinels and inputs.
30193 SmallVector<int, 64> OpMask;
30194 SmallVector<SDValue, 2> OpInputs;
30195 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
30198 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
30199 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
30200 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
30202 // Add the inputs to the Ops list, avoiding duplicates.
30203 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
// InputIdx0/1 record where each shuffle input lives in Ops; the loop below
// reuses an existing slot when the same value (modulo bitcasts) is present.
30205 int InputIdx0 = -1, InputIdx1 = -1;
30206 for (int i = 0, e = Ops.size(); i < e; ++i) {
30207 SDValue BC = peekThroughBitcasts(Ops[i]);
30208 if (Input0 && BC == peekThroughBitcasts(Input0))
30210 if (Input1 && BC == peekThroughBitcasts(Input1))
// Input0 replaces the op we are currently expanding; Input1 is appended.
30214 if (Input0 && InputIdx0 < 0) {
30215 InputIdx0 = SrcOpIndex;
30216 Ops[SrcOpIndex] = Input0;
30218 if (Input1 && InputIdx1 < 0) {
30219 InputIdx1 = Ops.size();
30220 Ops.push_back(Input1);
30223 assert(((RootMask.size() > OpMask.size() &&
30224 RootMask.size() % OpMask.size() == 0) ||
30225 (OpMask.size() > RootMask.size() &&
30226 OpMask.size() % RootMask.size() == 0) ||
30227 OpMask.size() == RootMask.size()) &&
30228 "The smaller number of elements must divide the larger.");
30230 // This function can be performance-critical, so we rely on the power-of-2
30231 // knowledge that we have about the mask sizes to replace div/rem ops with
30232 // bit-masks and shifts.
30233 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
30234 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
30235 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
30236 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
// The merged mask uses the finer granularity of the two masks; exactly one
// of RootRatio/OpRatio is > 1 (asserted below).
30238 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
30239 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
30240 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
30241 assert((RootRatio == 1 || OpRatio == 1) &&
30242 "Must not have a ratio for both incoming and op masks!");
30244 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
30245 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
30246 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
30247 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
30248 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
30250 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
30252 // Merge this shuffle operation's mask into our accumulated mask. Note that
30253 // this shuffle's mask will be the first applied to the input, followed by the
30254 // root mask to get us all the way to the root value arrangement. The reason
30255 // for this order is that we are recursing up the operation chain.
30256 for (unsigned i = 0; i < MaskWidth; ++i) {
30257 unsigned RootIdx = i >> RootRatioLog2;
30258 if (RootMask[RootIdx] < 0) {
30259 // This is a zero or undef lane, we're done.
30260 Mask[i] = RootMask[RootIdx];
// Scale the root mask index up to MaskWidth granularity (condition line for
// the RootRatio == 1 fast path is elided here).
30264 unsigned RootMaskedIdx =
30266 ? RootMask[RootIdx]
30267 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
30269 // Just insert the scaled root mask value if it references an input other
30270 // than the SrcOp we're currently inserting.
30271 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
30272 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
30273 Mask[i] = RootMaskedIdx;
// Power-of-2 trick: `& (MaskWidth - 1)` is the cheap form of `% MaskWidth`.
30277 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
30278 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
30279 if (OpMask[OpIdx] < 0) {
30280 // The incoming lanes are zero or undef, it doesn't matter which ones we
30282 Mask[i] = OpMask[OpIdx];
30286 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
30287 unsigned OpMaskedIdx =
30290 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
30292 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
// Indices below OpMask.size() come from Input0, the rest from Input1; offset
// each by its slot in Ops so the merged mask is linearized across all ops.
30293 if (OpMask[OpIdx] < (int)OpMask.size()) {
30294 assert(0 <= InputIdx0 && "Unknown target shuffle input");
30295 OpMaskedIdx += InputIdx0 * MaskWidth;
30297 assert(0 <= InputIdx1 && "Unknown target shuffle input");
30298 OpMaskedIdx += InputIdx1 * MaskWidth;
30301 Mask[i] = OpMaskedIdx;
30304 // Handle the all undef/zero cases early.
30305 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
30306 return DAG.getUNDEF(Root.getValueType());
30308 // TODO - should we handle the mixed zero/undef case as well? Just returning
30309 // a zero mask will lose information on undef elements possibly reducing
30310 // future combine possibilities.
30311 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
30312 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
30315 // Remove unused shuffle source ops.
30316 resolveTargetShuffleInputsAndMask(Ops, Mask);
30317 assert(!Ops.empty() && "Shuffle with no inputs detected");
30319 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
30321 // Update the list of shuffle nodes that have been combined so far.
30322 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
30324 CombinedNodes.push_back(Op.getNode());
30326 // See if we can recurse into each shuffle source op (if it's a target
30327 // shuffle). The source op should only be combined if it either has a
30328 // single use (i.e. current Op) or all its users have already been combined.
30329 // Don't recurse if we already have more source ops than we can combine in
30330 // the remaining recursion depth.
30331 if (Ops.size() < (MaxRecursionDepth - Depth)) {
30332 for (int i = 0, e = Ops.size(); i < e; ++i)
30333 if (Ops[i].getNode()->hasOneUse() ||
30334 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
30335 if (SDValue Res = combineX86ShufflesRecursively(
30336 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
30341 // Attempt to constant fold all of the constant source ops.
30342 if (SDValue Cst = combineX86ShufflesConstants(
30343 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
30346 // We can only combine unary and binary shuffle mask cases.
30347 if (Ops.size() > 2)
30350 // Minor canonicalization of the accumulated shuffle mask to make it easier
30351 // to match below. All this does is detect masks with sequential pairs of
30352 // elements, and shrink them to the half-width mask. It does this in a loop
30353 // so it will reduce the size of the mask to the minimal width mask which
30354 // performs an equivalent shuffle.
30355 SmallVector<int, 64> WidenedMask;
30356 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
30357 Mask = std::move(WidenedMask);
30360 // Canonicalization of binary shuffle masks to improve pattern matching by
30361 // commuting the inputs.
30362 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
30363 ShuffleVectorSDNode::commuteMask(Mask);
30364 std::swap(Ops[0], Ops[1]);
30367 // Finally, try to combine into a single shuffle instruction.
30368 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
30372 /// Get the PSHUF-style mask from PSHUF node.
30374 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
30375 /// PSHUF-style masks that can be reused with such instructions.
// Returns a 4-element mask: for PSHUFD the (lane-repeated) dword mask, for
// PSHUFLW/PSHUFHW just the low/high 4 word indices rebased to 0..3.
// NOTE(review): elided dump -- the `bool IsUnary;` declaration, the per-case
// return statements and closing braces are missing between the numbered lines.
30376 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
30377 MVT VT = N.getSimpleValueType();
30378 SmallVector<int, 4> Mask;
30379 SmallVector<SDValue, 2> Ops;
30382 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
30386 // If we have more than 128-bits, only the low 128-bits of shuffle mask
30387 // matter. Check that the upper masks are repeats and remove them.
30388 if (VT.getSizeInBits() > 128) {
30389 int LaneElts = 128 / VT.getScalarSizeInBits();
// Debug-only verification that every upper lane repeats the low lane's
// pattern (the #ifndef NDEBUG guard lines are elided in this dump).
30391 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
30392 for (int j = 0; j < LaneElts; ++j)
30393 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
30394 "Mask doesn't repeat in high 128-bit lanes!");
30396 Mask.resize(LaneElts);
30399 switch (N.getOpcode()) {
30400 case X86ISD::PSHUFD:
30402 case X86ISD::PSHUFLW:
30405 case X86ISD::PSHUFHW:
// For PSHUFHW: drop the low 4 (identity) words and rebase the high-word
// indices into the 0..3 range so callers see a uniform v4 mask.
30406 Mask.erase(Mask.begin(), Mask.begin() + 4);
30407 for (int &M : Mask)
30411 llvm_unreachable("No valid shuffle instruction found!");
30415 /// Search for a combinable shuffle across a chain ending in pshufd.
30417 /// We walk up the chain and look for a combinable shuffle, skipping over
30418 /// shuffles that we could hoist this shuffle's transformation past without
30419 /// altering anything.
// On success the PSHUFD's mask is merged into the found shuffle and the
// skipped chain is rebuilt on top of it; returns the replacement for N.
// NOTE(review): elided dump -- `continue;`/`break;` statements, the trailing
// `return V;` and several braces are missing between the numbered lines.
30421 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
30422 SelectionDAG &DAG) {
30423 assert(N.getOpcode() == X86ISD::PSHUFD &&
30424 "Called with something other than an x86 128-bit half shuffle!");
30427 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
30428 // of the shuffles in the chain so that we can form a fresh chain to replace
30430 SmallVector<SDValue, 8> Chain;
30431 SDValue V = N.getOperand(0);
30432 for (; V.hasOneUse(); V = V.getOperand(0)) {
30433 switch (V.getOpcode()) {
30435 return SDValue(); // Nothing combined!
30438 // Skip bitcasts as we always know the type for the target specific
30442 case X86ISD::PSHUFD:
30443 // Found another dword shuffle.
30446 case X86ISD::PSHUFLW:
30447 // Check that the low words (being shuffled) are the identity in the
30448 // dword shuffle, and the high words are self-contained.
30449 if (Mask[0] != 0 || Mask[1] != 1 ||
30450 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
// Hoistable: remember this node so the chain can be rebuilt below.
30453 Chain.push_back(V);
30456 case X86ISD::PSHUFHW:
30457 // Check that the high words (being shuffled) are the identity in the
30458 // dword shuffle, and the low words are self-contained.
30459 if (Mask[2] != 2 || Mask[3] != 3 ||
30460 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
30463 Chain.push_back(V);
30466 case X86ISD::UNPCKL:
30467 case X86ISD::UNPCKH:
30468 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
30469 // shuffle into a preceding word shuffle.
30470 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
30471 V.getSimpleValueType().getVectorElementType() != MVT::i16)
30474 // Search for a half-shuffle which we can combine with.
30475 unsigned CombineOp =
30476 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
// Only handle the self-unpack case (both operands identical and only used
// by this unpack) so the rebuild below can use `V, V`.
30477 if (V.getOperand(0) != V.getOperand(1) ||
30478 !V->isOnlyUserOf(V.getOperand(0).getNode()))
30480 Chain.push_back(V);
30481 V = V.getOperand(0);
// Inner walk below the unpack (a `do {` line is elided here).
30483 switch (V.getOpcode()) {
30485 return SDValue(); // Nothing to combine.
30487 case X86ISD::PSHUFLW:
30488 case X86ISD::PSHUFHW:
30489 if (V.getOpcode() == CombineOp)
30492 Chain.push_back(V);
30496 V = V.getOperand(0);
30500 } while (V.hasOneUse());
30503 // Break out of the loop if we break out of the switch.
30507 if (!V.hasOneUse())
30508 // We fell out of the loop without finding a viable combining instruction.
30511 // Merge this node's mask and our incoming mask.
30512 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
// Remap the incoming mask through V's mask (the `M = VMask[M];` body line is
// elided), then re-emit V with the merged immediate.
30513 for (int &M : Mask)
30515 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
30516 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30518 // Rebuild the chain around this new shuffle.
30519 while (!Chain.empty()) {
30520 SDValue W = Chain.pop_back_val();
30522 if (V.getValueType() != W.getOperand(0).getValueType())
30523 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
30525 switch (W.getOpcode()) {
30527 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
30529 case X86ISD::UNPCKL:
30530 case X86ISD::UNPCKH:
// Unpacks on the chain were self-unpacks (checked above), so rebuild as V,V.
30531 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
30534 case X86ISD::PSHUFD:
30535 case X86ISD::PSHUFLW:
30536 case X86ISD::PSHUFHW:
30537 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
30541 if (V.getValueType() != N.getValueType())
30542 V = DAG.getBitcast(N.getValueType(), V);
30544 // Return the new chain to replace N.
30548 /// Search for a combinable shuffle across a chain ending in pshuflw or
30551 /// We walk up the chain, skipping shuffles of the other half and looking
30552 /// through shuffles which switch halves trying to find a shuffle of the same
30553 /// pair of dwords.
// Returns true when it combined away N (callers must then stop using N).
// NOTE(review): elided dump -- the PSHUFD case of the walk, several
// `continue;`/`return` lines and braces are missing between the numbered lines.
30554 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
30556 TargetLowering::DAGCombinerInfo &DCI) {
30558 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
30559 "Called with something other than an x86 128-bit half shuffle!");
30561 unsigned CombineOpcode = N.getOpcode();
30563 // Walk up a single-use chain looking for a combinable shuffle.
30564 SDValue V = N.getOperand(0);
30565 for (; V.hasOneUse(); V = V.getOperand(0)) {
30566 switch (V.getOpcode()) {
30568 return false; // Nothing combined!
30571 // Skip bitcasts as we always know the type for the target specific
30575 case X86ISD::PSHUFLW:
30576 case X86ISD::PSHUFHW:
// Found a half-shuffle of the same half: this is the combinable node.
30577 if (V.getOpcode() == CombineOpcode)
30580 // Other-half shuffles are no-ops.
30583 // Break out of the loop if we break out of the switch.
30587 if (!V.hasOneUse())
30588 // We fell out of the loop without finding a viable combining instruction.
30591 // Combine away the bottom node as its shuffle will be accumulated into
30592 // a preceding shuffle.
30593 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30595 // Record the old value.
// (The `SDValue Old = V;` line appears to be elided here -- `Old` is used in
// the CombineTo call below.)
30598 // Merge this node's mask and our incoming mask (adjusted to account for all
30599 // the pshufd instructions encountered).
30600 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30601 for (int &M : Mask)
30603 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
30604 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30606 // Check that the shuffles didn't cancel each other out. If not, we need to
30607 // combine to the new one.
30609 // Replace the combinable shuffle with the combined one, updating all users
30610 // so that we re-evaluate the chain here.
30611 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
30616 /// Try to combine x86 target specific shuffles.
// Dispatches on N's x86 shuffle opcode and applies per-opcode peepholes
// (horizontal-op merging, VBROADCAST-of-shuffle simplification, PSHUF*
// chains, UNPCKL, MOVSD/MOVSS scalar-FP canonicalization, INSERTPS).
// NOTE(review): elided dump -- `break;`/`return` statements, a few
// declarations and many closing braces are missing between the numbered lines.
30617 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
30618 TargetLowering::DAGCombinerInfo &DCI,
30619 const X86Subtarget &Subtarget) {
30621 MVT VT = N.getSimpleValueType();
30622 SmallVector<int, 4> Mask;
30623 unsigned Opcode = N.getOpcode();
30625 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
30626 // single instruction.
30627 if (VT.getScalarSizeInBits() == 64 &&
30628 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
30629 Opcode == X86ISD::UNPCKL)) {
30630 auto BC0 = peekThroughBitcasts(N.getOperand(0));
30631 auto BC1 = peekThroughBitcasts(N.getOperand(1));
30632 EVT VT0 = BC0.getValueType();
30633 EVT VT1 = BC1.getValueType();
30634 unsigned Opcode0 = BC0.getOpcode();
30635 unsigned Opcode1 = BC1.getOpcode();
// Both operands must be the *same* horizontal/pack op on the same type so
// one merged op can produce the shuffled result directly.
30636 if (Opcode0 == Opcode1 && VT0 == VT1 &&
30637 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
30638 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
30639 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
30641 if (Opcode == X86ISD::MOVSD) {
30642 Lo = BC1.getOperand(0);
30643 Hi = BC0.getOperand(1);
// UNPCKL selects both ops' low halves, UNPCKH both high halves.
30645 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30646 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30648 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
30649 return DAG.getBitcast(VT, Horiz);
// Per-opcode peepholes (the `switch (Opcode) {` line is elided here).
30654 case X86ISD::VBROADCAST: {
30655 // If broadcasting from another shuffle, attempt to simplify it.
30656 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
30657 SDValue Src = N.getOperand(0);
30658 SDValue BC = peekThroughBitcasts(Src);
30659 EVT SrcVT = Src.getValueType();
30660 EVT BCVT = BC.getValueType();
30661 if (isTargetShuffle(BC.getOpcode()) &&
30662 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
// Only the first Scale elements of the inner shuffle feed the broadcast
// scalar, so demand just those and let the recursive combiner simplify.
30663 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
30664 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
30666 for (unsigned i = 0; i != Scale; ++i)
30667 DemandedMask[i] = i;
30668 if (SDValue Res = combineX86ShufflesRecursively(
30669 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
30670 /*HasVarMask*/ false, DAG, Subtarget))
30671 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
30672 DAG.getBitcast(SrcVT, Res));
30676 case X86ISD::PSHUFD:
30677 case X86ISD::PSHUFLW:
30678 case X86ISD::PSHUFHW:
// Falls through to the PSHUF* simplifications after the switch.
30679 Mask = getPSHUFShuffleMask(N);
30680 assert(Mask.size() == 4);
30682 case X86ISD::UNPCKL: {
30683 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
30684 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
30685 // moves upper half elements into the lower half part. For example:
30687 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
30689 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
30691 // will be combined to:
30693 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
30695 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
30696 // happen due to advanced instructions.
30697 if (!VT.is128BitVector())
30700 auto Op0 = N.getOperand(0);
30701 auto Op1 = N.getOperand(1);
30702 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
30703 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
// Expected mask: low half = <NumElts/2, ..., NumElts-1>, rest undef -- i.e.
// the shuffle that moves the upper half down.
30705 unsigned NumElts = VT.getVectorNumElements();
30706 SmallVector<int, 8> ExpectedMask(NumElts, -1);
30707 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
30710 auto ShufOp = Op1.getOperand(0);
30711 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
30712 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
30716 case X86ISD::MOVSD:
30717 case X86ISD::MOVSS: {
30718 SDValue N0 = N.getOperand(0);
30719 SDValue N1 = N.getOperand(1);
30721 // Canonicalize scalar FPOps:
30722 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
30723 // If commutable, allow OP(N1[0], N0[0]).
30724 unsigned Opcode1 = N1.getOpcode();
30725 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
30726 Opcode1 == ISD::FDIV) {
30727 SDValue N10 = N1.getOperand(0);
30728 SDValue N11 = N1.getOperand(1);
// Accept N0 as either operand of the FP op, but only swap for the
// commutable ops (FADD/FMUL); FSUB/FDIV order must be preserved.
30730 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
30732 std::swap(N10, N11);
30733 MVT SVT = VT.getVectorElementType();
30734 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
30735 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
30736 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
30737 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
30738 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
30739 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
30745 case X86ISD::INSERTPS: {
30746 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
30747 SDValue Op0 = N.getOperand(0);
30748 SDValue Op1 = N.getOperand(1);
30749 SDValue Op2 = N.getOperand(2);
// INSERTPS immediate layout: [7:6] = source elt, [5:4] = dest elt,
// [3:0] = zero mask (per-lane zeroing of the result).
30750 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
30751 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
30752 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
30753 unsigned ZeroMask = InsertPSMask & 0xF;
30755 // If we zero out all elements from Op0 then we don't need to reference it.
30756 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
30757 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
30758 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30760 // If we zero out the element from Op1 then we don't need to reference it.
30761 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
30762 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30763 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30765 // Attempt to merge insertps Op1 with an inner target shuffle node.
30766 SmallVector<int, 8> TargetMask1;
30767 SmallVector<SDValue, 2> Ops1;
30768 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
30769 int M = TargetMask1[SrcIdx];
30770 if (isUndefOrZero(M)) {
30771 // Zero/UNDEF insertion - zero out element and remove dependency.
30772 InsertPSMask |= (1u << DstIdx);
30773 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30774 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30776 // Update insertps mask srcidx and reference the source input directly.
30777 assert(0 <= M && M < 8 && "Shuffle index out of range");
30778 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
30779 Op1 = Ops1[M < 4 ? 0 : 1];
30780 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30781 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30784 // Attempt to merge insertps Op0 with an inner target shuffle node.
30785 SmallVector<int, 8> TargetMask0;
30786 SmallVector<SDValue, 2> Ops0;
30787 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
30790 bool Updated = false;
30791 bool UseInput00 = false;
30792 bool UseInput01 = false;
30793 for (int i = 0; i != 4; ++i) {
30794 int M = TargetMask0[i];
30795 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
30796 // No change if element is already zero or the inserted element.
30798 } else if (isUndefOrZero(M)) {
30799 // If the target mask is undef/zero then we must zero the element.
30800 InsertPSMask |= (1u << i);
30805 // The input vector element must be inline.
30806 if (M != i && M != (i + 4))
30809 // Determine which inputs of the target shuffle we're using.
30810 UseInput00 |= (0 <= M && M < 4);
30811 UseInput01 |= (4 <= M);
30814 // If we're not using both inputs of the target shuffle then use the
30815 // referenced input directly.
30816 if (UseInput00 && !UseInput01) {
// (Op0 = Ops0[0]/Ops0[1] assignments here are elided in this dump.)
30819 } else if (!UseInput00 && UseInput01) {
30825 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30826 DAG.getConstant(InsertPSMask, DL, MVT::i8));
// Post-switch PSHUF* handling: Mask was filled in above for
// PSHUFD/PSHUFLW/PSHUFHW only.
30834 // Nuke no-op shuffles that show up after combining.
30835 if (isNoopShuffleMask(Mask))
30836 return N.getOperand(0);
30838 // Look for simplifications involving one or two shuffle instructions.
30839 SDValue V = N.getOperand(0);
30840 switch (N.getOpcode()) {
30843 case X86ISD::PSHUFLW:
30844 case X86ISD::PSHUFHW:
30845 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
30847 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
30848 return SDValue(); // We combined away this shuffle, so we're done.
30850 // See if this reduces to a PSHUFD which is no more expensive and can
30851 // combine with more operations. Note that it has to at least flip the
30852 // dwords as otherwise it would have been removed as a no-op.
30853 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
30854 int DMask[] = {0, 1, 2, 3};
30855 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
30856 DMask[DOffset + 0] = DOffset + 1;
30857 DMask[DOffset + 1] = DOffset + 0;
30858 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30859 V = DAG.getBitcast(DVT, V);
30860 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
30861 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
30862 return DAG.getBitcast(VT, V);
30865 // Look for shuffle patterns which can be implemented as a single unpack.
30866 // FIXME: This doesn't handle the location of the PSHUFD generically, and
30867 // only works when we have a PSHUFD followed by two half-shuffles.
30868 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30869 (V.getOpcode() == X86ISD::PSHUFLW ||
30870 V.getOpcode() == X86ISD::PSHUFHW) &&
30871 V.getOpcode() != N.getOpcode() &&
30873 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30874 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
// Expand the two half-shuffle masks into one 8-wide word mask, then map
// it through the dword shuffle to see if the net effect is an unpack.
30875 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30876 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30877 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30878 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30880 for (int i = 0; i < 4; ++i) {
30881 WordMask[i + NOffset] = Mask[i] + NOffset;
30882 WordMask[i + VOffset] = VMask[i] + VOffset;
30884 // Map the word mask through the DWord mask.
30886 for (int i = 0; i < 8; ++i)
30887 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30888 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30889 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30890 // We can replace all three shuffles with an unpack.
30891 V = DAG.getBitcast(VT, D.getOperand(0));
30892 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30901 case X86ISD::PSHUFD:
30902 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30911 /// Checks if the shuffle mask takes subsequent elements
30912 /// alternately from two vectors.
30913 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
/// On success \p Op0Even is set to true iff the even-numbered mask positions
/// are taken from input 0 (i.e. ParitySrc[0] == 0 below).
30914 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
// ParitySrc[0]/ParitySrc[1] record which input (0 or 1) feeds the even/odd
// mask positions respectively; -1 means that parity has not been seen yet.
30916 int ParitySrc[2] = {-1, -1};
30917 unsigned Size = Mask.size();
30918 for (unsigned i = 0; i != Size; ++i) {
30923 // Make sure we are using the matching element from the input.
30924 if ((M % Size) != i)
30927 // Make sure we use the same input for all elements of the same parity.
30928 int Src = M / Size;
30929 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
30931 ParitySrc[i % 2] = Src;
30934 // Make sure each input is used.
30935 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
30938 Op0Even = ParitySrc[0] == 0;
30942 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
30943 /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
30944 /// are written to the parameters \p Opnd0 and \p Opnd1.
30946 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
30947 /// so it is easier to generically match. We also insert dummy vector shuffle
30948 /// nodes for the operands which explicitly discard the lanes which are unused
30949 /// by this operation to try to flow through the rest of the combiner
30950 /// the fact that they're unused.
/// IsSubAdd is set to true when the even-parity lanes come from the FADD node
/// (the SUBADD form); see the assignment at the end of this function.
30951 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30952 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
// ADDSUB/SUBADD requires SSE3 and a legal floating-point vector type.
30955 EVT VT = N->getValueType(0);
30956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30957 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
30958 !VT.getSimpleVT().isFloatingPoint())
30961 // We only handle target-independent shuffles.
30962 // FIXME: It would be easy and harmless to use the target shuffle mask
30963 // extraction tool to support more.
30964 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30967 SDValue V1 = N->getOperand(0);
30968 SDValue V2 = N->getOperand(1);
30970 // Make sure we have an FADD and an FSUB.
30971 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
30972 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
30973 V1.getOpcode() == V2.getOpcode())
30976 // If there are other uses of these operations we can't fold them.
30977 if (!V1->hasOneUse() || !V2->hasOneUse())
30980 // Ensure that both operations have the same operands. Note that we can
30981 // commute the FADD operands.
// Take (LHS, RHS) from whichever node is the FSUB (FSUB is not commutable),
// then accept the other node's operands in either order.
30983 if (V1.getOpcode() == ISD::FSUB) {
30984 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
30985 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30986 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30989 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
30990 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
30991 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
30992 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
30996 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30998 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31001 // It's a subadd if the vector in the even parity is an FADD.
31002 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
31003 : V2->getOpcode() == ISD::FADD;
31010 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
/// Matches (shuffle (fma a, b, c), (X86fmsub a, b, c), alternating-mask) and
/// folds it to a single X86ISD::FMADDSUB / FMSUBADD node.
31011 static SDValue combineShuffleToFMAddSub(SDNode *N,
31012 const X86Subtarget &Subtarget,
31013 SelectionDAG &DAG) {
31014 // We only handle target-independent shuffles.
31015 // FIXME: It would be easy and harmless to use the target shuffle mask
31016 // extraction tool to support more.
31017 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
31020 MVT VT = N->getSimpleValueType(0);
31021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31022 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
31025 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c).
31026 SDValue Op0 = N->getOperand(0);
31027 SDValue Op1 = N->getOperand(1);
// Canonicalize so FMSub is the X86ISD::FMSUB node (the operands may appear
// in either order in the shuffle).
31028 SDValue FMAdd = Op0, FMSub = Op1;
31029 if (FMSub.getOpcode() != X86ISD::FMSUB)
31030 std::swap(FMAdd, FMSub);
// Both nodes must share all three operands (a, b, c) and have no other uses.
31032 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
31033 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
31034 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
31035 FMAdd.getOperand(2) != FMSub.getOperand(2))
31038 // Check for correct shuffle mask.
31039 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31041 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31044 // FMAddSub takes zeroth operand from FMSub node.
31046 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
31047 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31048 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
31049 FMAdd.getOperand(2));
31052 /// Try to combine a shuffle into a target-specific add-sub or
31053 /// mul-add-sub node.
/// First tries the FMA-based FMADDSUB/FMSUBADD fold, then falls back to the
/// plain FADD/FSUB-based ADDSUB match.
31054 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
31055 const X86Subtarget &Subtarget,
31056 SelectionDAG &DAG) {
31057 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
31060 SDValue Opnd0, Opnd1;
31062 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
31065 MVT VT = N->getSimpleValueType(0);
31068 // Try to generate X86ISD::FMADDSUB node here.
// If the ADDSUB operands themselves come from a multiply, fuse into an
// FMADDSUB/FMSUBADD instead of emitting a separate multiply + ADDSUB.
31070 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
31071 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31072 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
31078 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
31079 // the ADDSUB idiom has been successfully recognized. There are no known
31080 // X86 targets with 512-bit ADDSUB instructions!
31081 if (VT.is512BitVector())
31084 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
31087 // We are looking for a shuffle where both sources are concatenated with undef
31088 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
31089 // if we can express this as a single-source shuffle, that's preferable.
31090 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
31091 const X86Subtarget &Subtarget) {
31092 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
31095 EVT VT = N->getValueType(0);
31097 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
31098 if (!VT.is128BitVector() && !VT.is256BitVector())
31101 if (VT.getVectorElementType() != MVT::i32 &&
31102 VT.getVectorElementType() != MVT::i64 &&
31103 VT.getVectorElementType() != MVT::f32 &&
31104 VT.getVectorElementType() != MVT::f64)
31107 SDValue N0 = N->getOperand(0);
31108 SDValue N1 = N->getOperand(1);
31110 // Check that both sources are concats with undef.
31111 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
31112 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
31113 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
31114 !N1.getOperand(1).isUndef())
31117 // Construct the new shuffle mask. Elements from the first source retain their
31118 // index, but elements from the second source no longer need to skip an undef.
31119 SmallVector<int, 8> Mask;
31120 int NumElts = VT.getVectorNumElements();
31122 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31123 for (int Elt : SVOp->getMask())
// Second-source indices shift down by NumElts/2: the new concat packs t2
// directly after t1 where the first concat's undef half used to be.
31124 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
31127 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
31129 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
31132 /// Eliminate a redundant shuffle of a horizontal math op.
/// When a horizontal op has identical operands, its low and high halves are
/// equal, so a shuffle that merely replicates halves can be dropped entirely.
31133 static SDValue foldShuffleOfHorizOp(SDNode *N) {
31134 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
31137 SDValue HOp = N->getOperand(0);
31138 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
31139 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
31142 // 128-bit horizontal math instructions are defined to operate on adjacent
31143 // lanes of each operand as:
31144 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
31145 // ...similarly for v2f64 and v8i16.
31146 // TODO: Handle UNDEF operands.
31147 if (HOp.getOperand(0) != HOp.getOperand(1))
31150 // When the operands of a horizontal math op are identical, the low half of
31151 // the result is the same as the high half. If the shuffle is also replicating
31152 // low and high halves, we don't need the shuffle.
31153 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
31154 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31155 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
31156 // but this should be tied to whatever horizontal op matching and shuffle
31157 // canonicalization are producing.
// 128-bit case: masks that broadcast the low half (per element width).
31158 if (HOp.getValueSizeInBits() == 128 &&
31159 (isTargetShuffleEquivalent(Mask, {0, 0}) ||
31160 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
31161 isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3})))
// 256-bit case: the replication happens within each 128-bit lane.
31164 if (HOp.getValueSizeInBits() == 256 &&
31165 (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) ||
31166 isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) ||
31167 isTargetShuffleEquivalent(
31168 Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11})))
/// Top-level DAG-combine entry point for shuffle nodes: tries ADDSUB/FMADDSUB
/// fusion, horizontal-op simplification, bitcast/BINOP commutation, load
/// consolidation, concat-undef merging, and target-shuffle recombination.
31174 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
31175 TargetLowering::DAGCombinerInfo &DCI,
31176 const X86Subtarget &Subtarget) {
31178 EVT VT = N->getValueType(0);
31179 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31180 // If we have legalized the vector types, look for blends of FADD and FSUB
31181 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
31182 if (TLI.isTypeLegal(VT)) {
31183 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
31186 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
31190 // During Type Legalization, when promoting illegal vector types,
31191 // the backend might introduce new shuffle dag nodes and bitcasts.
31193 // This code performs the following transformation:
31194 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
31195 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
31197 // We do this only if both the bitcast and the BINOP dag nodes have
31198 // one use. Also, perform this transformation only if the new binary
31199 // operation is legal. This is to avoid introducing dag nodes that
31200 // potentially need to be further expanded (or custom lowered) into a
31201 // less optimal sequence of dag nodes.
31202 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
31203 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
31204 N->getOperand(0).getOpcode() == ISD::BITCAST &&
31205 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
31206 SDValue N0 = N->getOperand(0);
31207 SDValue N1 = N->getOperand(1);
31209 SDValue BC0 = N0.getOperand(0);
31210 EVT SVT = BC0.getValueType();
31211 unsigned Opcode = BC0.getOpcode();
31212 unsigned NumElts = VT.getVectorNumElements();
// Only handle widening bitcasts: source has exactly half the element count.
31214 if (BC0.hasOneUse() && SVT.isVector() &&
31215 SVT.getVectorNumElements() * 2 == NumElts &&
31216 TLI.isOperationLegal(Opcode, VT)) {
31217 bool CanFold = false;
31223 // isOperationLegal lies for integer ops on floating point types.
31224 CanFold = VT.isInteger();
31229 // isOperationLegal lies for floating point ops on integer types.
31230 CanFold = VT.isFloatingPoint();
// The mask must select every source element into the even slots and leave
// the widened upper elements undef.
31234 unsigned SVTNumElts = SVT.getVectorNumElements();
31235 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31236 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
31237 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
31238 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
31239 CanFold = SVOp->getMaskElt(i) < 0;
31242 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
31243 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
31244 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
31245 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
31250 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
31251 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
31252 // consecutive, non-overlapping, and in the right order.
31253 SmallVector<SDValue, 16> Elts;
31254 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
31255 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
31256 Elts.push_back(Elt);
// Only try the consecutive-load fold if every element was resolved.
31263 if (Elts.size() == VT.getVectorNumElements())
31265 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
31268 // For AVX2, we sometimes want to combine
31269 // (vector_shuffle <mask> (concat_vectors t1, undef)
31270 // (concat_vectors t2, undef))
31272 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
31273 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
31274 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
31277 if (isTargetShuffle(N->getOpcode())) {
31279 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
31282 // Try recursively combining arbitrary sequences of x86 shuffle
31283 // instructions into higher-order shuffles. We do this after combining
31284 // specific PSHUF instruction sequences into their minimal form so that we
31285 // can evaluate how many specialized shuffle instructions are involved in
31286 // a particular chain.
31287 if (SDValue Res = combineX86ShufflesRecursively(
31288 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31289 /*HasVarMask*/ false, DAG, Subtarget))
31296 /// Check if a vector extract from a target-specific shuffle of a load can be
31297 /// folded into a single element load.
31298 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
31299 /// shuffles have been custom lowered so we need to handle those here.
31300 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
31301 TargetLowering::DAGCombinerInfo &DCI) {
31302 if (DCI.isBeforeLegalizeOps())
31305 SDValue InVec = N->getOperand(0);
31306 SDValue EltNo = N->getOperand(1);
31307 EVT EltVT = N->getValueType(0);
// The extract index must be a compile-time constant to trace the shuffle.
31309 if (!isa<ConstantSDNode>(EltNo))
31312 EVT OriginalVT = InVec.getValueType();
31314 // Peek through bitcasts, don't duplicate a load with other uses.
31315 InVec = peekThroughOneUseBitcasts(InVec);
31317 EVT CurrentVT = InVec.getValueType();
31318 if (!CurrentVT.isVector() ||
31319 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
31322 if (!isTargetShuffle(InVec.getOpcode()))
31325 // Don't duplicate a load with other uses.
31326 if (!InVec.hasOneUse())
31329 SmallVector<int, 16> ShuffleMask;
31330 SmallVector<SDValue, 2> ShuffleOps;
31332 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
31333 ShuffleOps, ShuffleMask, UnaryShuffle))
31336 // Select the input vector, guarding against out of range extract vector.
31337 unsigned NumElems = CurrentVT.getVectorNumElements();
31338 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
31339 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
// A zero lane folds to a scalar zero; an undef lane folds to undef.
31341 if (Idx == SM_SentinelZero)
31342 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
31343 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT)
31344 if (Idx == SM_SentinelUndef)
31345 return DAG.getUNDEF(EltVT);
31347 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
// Indices < NumElems select from the first shuffle input, otherwise the
// second.
31348 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
31351 // If inputs to shuffle are the same for both ops, then allow 2 uses
31352 unsigned AllowedUses =
31353 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
31355 if (LdNode.getOpcode() == ISD::BITCAST) {
31356 // Don't duplicate a load with other uses.
31357 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
31360 AllowedUses = 1; // only allow 1 load use if we have a bitcast
31361 LdNode = LdNode.getOperand(0);
31364 if (!ISD::isNormalLoad(LdNode.getNode()))
31367 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
// Volatile loads must not be split or duplicated.
31369 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
31372 // If there's a bitcast before the shuffle, check if the load type and
31373 // alignment is valid.
31374 unsigned Align = LN0->getAlignment();
31375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31376 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
31377 EltVT.getTypeForEVT(*DAG.getContext()));
31379 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
31382 // All checks match so transform back to vector_shuffle so that DAG combiner
31383 // can finish the job
31386 // Create shuffle node taking into account the case that its a unary shuffle
31387 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
31388 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
31390 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
31391 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
31395 // Try to match patterns such as
31396 // (i16 bitcast (v16i1 x))
31398 // (i16 movmsk (16i8 sext (v16i1 x)))
31399 // before the illegal vector is scalarized on subtargets that don't have legal
31401 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
31402 const X86Subtarget &Subtarget) {
31403 EVT VT = BitCast.getValueType();
31404 SDValue N0 = BitCast.getOperand(0);
31405 EVT VecVT = N0->getValueType(0);
31407 if (!VT.isScalarInteger() || !VecVT.isSimple())
31410 // With AVX512 vxi1 types are legal and we prefer using k-regs.
31411 // MOVMSK is supported in SSE2 or later.
31412 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
31415 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
31416 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
31417 // v8i16 and v16i16.
31418 // For these two cases, we can shuffle the upper element bytes to a
31419 // consecutive sequence at the start of the vector and treat the results as
31420 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
31421 // for v16i16 this is not the case, because the shuffle is expensive, so we
31422 // avoid sign-extending to this type entirely.
31423 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
31424 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
// SExtVT picks the sign-extended vector type whose MOVMSK result matches the
// i1 vector; FPCastVT is the FP type whose MOVMSK variant is used, when any.
31426 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
31427 switch (VecVT.getSimpleVT().SimpleTy) {
31431 SExtVT = MVT::v2i64;
31432 FPCastVT = MVT::v2f64;
31435 SExtVT = MVT::v4i32;
31436 FPCastVT = MVT::v4f32;
31437 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
31438 // sign-extend to a 256-bit operation to avoid truncation.
31439 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31440 N0->getOperand(0).getValueType().is256BitVector()) {
31441 SExtVT = MVT::v4i64;
31442 FPCastVT = MVT::v4f64;
31446 SExtVT = MVT::v8i16;
31447 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
31448 // sign-extend to a 256-bit operation to match the compare.
31449 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
31450 // 256-bit because the shuffle is cheaper than sign extending the result of
31452 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31453 (N0->getOperand(0).getValueType().is256BitVector() ||
31454 N0->getOperand(0).getValueType().is512BitVector())) {
31455 SExtVT = MVT::v8i32;
31456 FPCastVT = MVT::v8f32;
31460 SExtVT = MVT::v16i8;
31461 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
31462 // it is not profitable to sign-extend to 256-bit because this will
31463 // require an extra cross-lane shuffle which is more expensive than
31464 // truncating the result of the compare to 128-bits.
31467 SExtVT = MVT::v32i8;
// Sign-extend the i1 vector so every lane becomes all-ones/all-zeros, then
// collect the sign bits with the appropriate MOVMSK flavor.
31472 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
31474 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
31475 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31476 return DAG.getZExtOrTrunc(V, DL, VT);
// v8i16 has no MOVMSK: narrow to v16i8 with PACKSS first (lossless here
// because every lane is all-ones or all-zeros).
31479 if (SExtVT == MVT::v8i16) {
31480 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
31481 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
31482 DAG.getUNDEF(MVT::v8i16));
31484 assert(SExtVT.getScalarType() != MVT::i16 &&
31485 "Vectors of i16 must be packed");
31486 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
31487 V = DAG.getBitcast(FPCastVT, V);
31488 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31489 return DAG.getZExtOrTrunc(V, DL, VT);
31492 // Convert a vXi1 constant build vector to the same width scalar integer.
// Element Idx maps to bit Idx of the result; undef elements become 0 bits.
31493 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
31494 EVT SrcVT = Op.getValueType();
31495 assert(SrcVT.getVectorElementType() == MVT::i1 &&
31496 "Expected a vXi1 vector");
31497 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
31498 "Expected a constant build vector");
31500 APInt Imm(SrcVT.getVectorNumElements(), 0);
31501 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
31502 SDValue In = Op.getOperand(Idx);
// Only the low bit of each constant element is meaningful for an i1 lane.
31503 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
31506 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
31507 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
/// Push a bitcast through AND/OR/XOR so mask (vXi1) arithmetic stays on the
/// k-register side and scalar arithmetic stays on the GPR side, avoiding
/// GPR<->K-register crossings on AVX512 targets.
31510 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31511 TargetLowering::DAGCombinerInfo &DCI,
31512 const X86Subtarget &Subtarget) {
31513 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
31515 if (!DCI.isBeforeLegalizeOps())
31518 // Only do this if we have k-registers.
31519 if (!Subtarget.hasAVX512())
31522 EVT DstVT = N->getValueType(0);
31523 SDValue Op = N->getOperand(0);
31524 EVT SrcVT = Op.getValueType();
31526 if (!Op.hasOneUse())
31529 // Look for logic ops.
31530 if (Op.getOpcode() != ISD::AND &&
31531 Op.getOpcode() != ISD::OR &&
31532 Op.getOpcode() != ISD::XOR)
31535 // Make sure we have a bitcast between mask registers and a scalar type.
31536 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31537 DstVT.isScalarInteger()) &&
31538 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
31539 SrcVT.isScalarInteger()))
31542 SDValue LHS = Op.getOperand(0);
31543 SDValue RHS = Op.getOperand(1);
// If either logic operand is itself a bitcast from DstVT, commute the casts:
// bitcast(logic(bitcast(X), Y)) --> logic(X, bitcast(Y)), and symmetrically.
31545 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
31546 LHS.getOperand(0).getValueType() == DstVT)
31547 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
31548 DAG.getBitcast(DstVT, RHS));
31550 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
31551 RHS.getOperand(0).getValueType() == DstVT)
31552 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31553 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
31555 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
31556 // Most of these have to move a constant from the scalar domain anyway.
31557 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
31558 RHS = combinevXi1ConstantToInteger(RHS, DAG);
31559 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31560 DAG.getBitcast(DstVT, LHS), RHS);
/// Lower a 64-bit BUILD_VECTOR (v2f32/v2i32/v4i16/v8i8) directly into MMX
/// form, using MOVW2D/MOVDQ2Q to create elements and PUNPCKL*/PSHUFW to
/// assemble them, avoiding a store-load round trip.
31566 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
31567 const X86Subtarget &Subtarget) {
31569 unsigned NumElts = N.getNumOperands();
31571 auto *BV = cast<BuildVectorSDNode>(N);
31572 SDValue Splat = BV->getSplatValue();
31574 // Build MMX element from integer GPR or SSE float values.
31575 auto CreateMMXElement = [&](SDValue V) {
31577 return DAG.getUNDEF(MVT::x86mmx);
31578 if (V.getValueType().isFloatingPoint()) {
// Non-constant FP with SSE1: move through an XMM register via MOVDQ2Q.
31579 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
31580 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31581 V = DAG.getBitcast(MVT::v2i64, V);
31582 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31584 V = DAG.getBitcast(MVT::i32, V);
31586 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31588 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31591 // Convert build vector ops to MMX data in the bottom elements.
31592 SmallVector<SDValue, 8> Ops;
31594 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
31596 if (Splat.isUndef())
31597 return DAG.getUNDEF(MVT::x86mmx);
31599 Splat = CreateMMXElement(Splat);
31601 if (Subtarget.hasSSE1()) {
31602 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
31604 Splat = DAG.getNode(
31605 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31606 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31609 // Use PSHUFW to repeat 16-bit elements.
// Immediate 0x00 repeats word 0 everywhere; 0x44 repeats dword 0 (for v2).
31610 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
31611 return DAG.getNode(
31612 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31613 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31614 DAG.getConstant(ShufMask, DL, MVT::i8));
// Without SSE1 PSHUFW is unavailable; fall back to the generic unpack tree.
31616 Ops.append(NumElts, Splat);
31618 for (unsigned i = 0; i != NumElts; ++i)
31619 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31622 // Use tree of PUNPCKLs to build up general MMX vector.
// Each round halves the op count, interleaving pairs at the element width
// that matches the remaining count (bytes -> words -> dwords).
31623 while (Ops.size() > 1) {
31624 unsigned NumOps = Ops.size();
31625 unsigned IntrinOp =
31626 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31627 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31628 : Intrinsic::x86_mmx_punpcklbw));
31629 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31630 for (unsigned i = 0; i != NumOps; i += 2)
31631 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31632 Ops[i], Ops[i + 1]);
31633 Ops.resize(NumOps / 2);
/// DAG-combine entry point for BITCAST nodes: vXi1<->scalar MOVMSK folds,
/// AVX512 mask-type widening, MMX special cases, constant vXi1 folding, mask
/// arithmetic cast commutation, and int-logic -> FP-logic conversion.
31639 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
31640 TargetLowering::DAGCombinerInfo &DCI,
31641 const X86Subtarget &Subtarget) {
31642 SDValue N0 = N->getOperand(0);
31643 EVT VT = N->getValueType(0);
31644 EVT SrcVT = N0.getValueType();
31646 // Try to match patterns such as
31647 // (i16 bitcast (v16i1 x))
31649 // (i16 movmsk (16i8 sext (v16i1 x)))
31650 // before the setcc result is scalarized on subtargets that don't have legal
31652 if (DCI.isBeforeLegalize()) {
31653 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
31656 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31657 // type, widen both sides to avoid a trip through memory.
31658 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
31659 Subtarget.hasAVX512()) {
// Widen scalar->i8, bitcast to v8i1, then extract the low subvector.
31661 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
31662 N0 = DAG.getBitcast(MVT::v8i1, N0);
31663 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
31664 DAG.getIntPtrConstant(0, dl));
31667 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31668 // type, widen both sides to avoid a trip through memory.
31669 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
31670 Subtarget.hasAVX512()) {
// Pad the mask vector with undefs up to v8i1 before bitcasting to i8.
31672 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
31673 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
31675 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
31676 N0 = DAG.getBitcast(MVT::i8, N0);
31677 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
31681 // Since MMX types are special and don't usually play with other vector types,
31682 // it's better to handle them early to be sure we emit efficient code by
31683 // avoiding store-load conversions.
31684 if (VT == MVT::x86mmx) {
31685 // Detect MMX constant vectors.
31687 SmallVector<APInt, 1> EltBits;
31688 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
31690 // Handle zero-extension of i32 with MOVD.
31691 if (EltBits[0].countLeadingZeros() >= 32)
31692 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
31693 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
31694 // Else, bitcast to a double.
31695 // TODO - investigate supporting sext 32-bit immediates on x86_64.
31696 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
31697 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
31700 // Detect bitcasts to x86mmx low word.
31701 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31702 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
31703 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
31704 bool LowUndef = true, AllUndefOrZero = true;
31705 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
31706 SDValue Op = N0.getOperand(i);
31707 LowUndef &= Op.isUndef() || (i >= e/2);
31708 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
31710 if (AllUndefOrZero) {
31711 SDValue N00 = N0.getOperand(0);
// MOVW2D zeroes the upper bits, so only zero-extend when the low half
// must be preserved exactly; any-extend suffices if it's undef.
31713 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
31714 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
31715 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
31719 // Detect bitcasts of 64-bit build vectors and convert to a
31720 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
31722 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31723 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31724 SrcVT == MVT::v8i8))
31725 return createMMXBuildVector(N0, DAG, Subtarget);
31727 // Detect bitcasts between element or subvector extraction to x86mmx.
31728 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
31729 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
31730 isNullConstant(N0.getOperand(1))) {
31731 SDValue N00 = N0.getOperand(0);
31732 if (N00.getValueType().is128BitVector())
31733 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
31734 DAG.getBitcast(MVT::v2i64, N00));
31737 // Detect bitcasts from FP_TO_SINT to x86mmx.
31738 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
31740 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
31741 DAG.getUNDEF(MVT::v2i32));
31742 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
31743 DAG.getBitcast(MVT::v2i64, Res));
31747 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
31748 // most of these to scalar anyway.
31749 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
31750 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31751 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
31752 return combinevXi1ConstantToInteger(N0, DAG);
// Inverse direction: scalar all-ones/zero constant bitcast to a vXi1 mask
// becomes a splat of 1/0.
31755 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
31756 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
31757 isa<ConstantSDNode>(N0)) {
31758 auto *C = cast<ConstantSDNode>(N0);
31759 if (C->isAllOnesValue())
31760 return DAG.getConstant(1, SDLoc(N0), VT);
31761 if (C->isNullValue())
31762 return DAG.getConstant(0, SDLoc(N0), VT);
31765 // Try to remove bitcasts from input and output of mask arithmetic to
31766 // remove GPR<->K-register crossings.
31767 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
31770 // Convert a bitcasted integer logic operation that has one bitcasted
31771 // floating-point operand into a floating-point logic operation. This may
31772 // create a load of a constant, but that is cheaper than materializing the
31773 // constant in an integer register and transferring it to an SSE register or
31774 // transferring the SSE operand to integer register and back.
31776 switch (N0.getOpcode()) {
31777 case ISD::AND: FPOpcode = X86ISD::FAND; break;
31778 case ISD::OR: FPOpcode = X86ISD::FOR; break;
31779 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
31780 default: return SDValue();
// FAND/FOR/FXOR need SSE1 for f32 and SSE2 for f64.
31783 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
31784 (Subtarget.hasSSE2() && VT == MVT::f64)))
31787 SDValue LogicOp0 = N0.getOperand(0);
31788 SDValue LogicOp1 = N0.getOperand(1);
31791 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
31792 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
31793 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
31794 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
31795 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
31796 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
31798 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
31799 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
31800 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
31801 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
31802 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
31803 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
31809 // Match a binop + shuffle pyramid that represents a horizontal reduction over
31810 // the elements of a vector.
31811 // Returns the vector that is being reduced on, or SDValue() if a reduction
31812 // was not matched.
// On success the out-parameter BinOp is set to the opcode (one of
// CandidateBinOps) that forms every level of the pyramid.
31813 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
31814 ArrayRef<ISD::NodeType> CandidateBinOps) {
31815 // The pattern must end in an extract from index 0.
31816 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
31817 !isNullConstant(Extract->getOperand(1)))
31820 SDValue Op = Extract->getOperand(0);
// A full reduction of N lanes requires log2(N) halving stages.
31821 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
31823 // Match against one of the candidate binary ops.
31824 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
31825 return Op.getOpcode() == unsigned(BinOp);
31829 // At each stage, we're looking for something that looks like:
31830 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
31831 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
31832 // i32 undef, i32 undef, i32 undef, i32 undef>
31833 // %a = binop <8 x i32> %op, %s
31834 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
31835 // we expect something like:
31836 // <4,5,6,7,u,u,u,u>
31837 // <2,3,u,u,u,u,u,u>
31838 // <1,u,u,u,u,u,u,u>
31839 unsigned CandidateBinOp = Op.getOpcode();
31840 for (unsigned i = 0; i < Stages; ++i) {
// Every level of the pyramid must use the same opcode.
31841 if (Op.getOpcode() != CandidateBinOp)
// The shuffle may appear as either operand of the binop; try operand 0
// first, then fall back to operand 1.
31844 ShuffleVectorSDNode *Shuffle =
31845 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
31847 Op = Op.getOperand(1);
31849 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
31850 Op = Op.getOperand(0);
31853 // The first operand of the shuffle should be the same as the other operand
31855 if (!Shuffle || Shuffle->getOperand(0) != Op)
31858 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
// At stage i the low 2^i lanes must be sourced from lanes 2^i..2^(i+1)-1.
31859 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
31860 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
// Report which candidate opcode formed the pyramid.
31864 BinOp = CandidateBinOp;
31868 // Given a select, detect the following pattern:
31869 // 1: %2 = zext <N x i8> %0 to <N x i32>
31870 // 2: %3 = zext <N x i8> %1 to <N x i32>
31871 // 3: %4 = sub nsw <N x i32> %2, %3
31872 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
31873 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
31874 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
31875 // This is useful as it is the input into a SAD pattern.
// On success, Op0/Op1 (out-parameters) are set to the two ZERO_EXTEND nodes
// feeding the inner subtraction.
31876 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
31878 // Check the condition of the select instruction is greater-than.
31879 SDValue SetCC = Select->getOperand(0);
31880 if (SetCC.getOpcode() != ISD::SETCC)
31882 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
// Accept both orientations of the comparison (sgt vs slt).
31883 if (CC != ISD::SETGT && CC != ISD::SETLT)
31886 SDValue SelectOp1 = Select->getOperand(1);
31887 SDValue SelectOp2 = Select->getOperand(2);
31889 // The following instructions assume SelectOp1 is the subtraction operand
31890 // and SelectOp2 is the negation operand.
31891 // In the case of SETLT this is the other way around.
31892 if (CC == ISD::SETLT)
31893 std::swap(SelectOp1, SelectOp2);
31895 // The second operand of the select should be the negation of the first
31896 // operand, which is implemented as 0 - SelectOp1.
31897 if (!(SelectOp2.getOpcode() == ISD::SUB &&
31898 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
31899 SelectOp2.getOperand(1) == SelectOp1))
31902 // The first operand of SetCC is the first operand of the select, which is the
31903 // difference between the two input vectors.
31904 if (SetCC.getOperand(0) != SelectOp1)
31907 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
31909 if ((CC == ISD::SETLT) &&
31910 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
31911 SplatVal.isOneValue()) ||
31912 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
31915 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
31916 if ((CC == ISD::SETGT) &&
31917 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
31918 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
31921 // The first operand of the select is the difference between the two input
31923 if (SelectOp1.getOpcode() != ISD::SUB)
31926 Op0 = SelectOp1.getOperand(0);
31927 Op1 = SelectOp1.getOperand(1);
31929 // Check if the operands of the sub are zero-extended from vectors of i8.
31930 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
31931 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
31932 Op1.getOpcode() != ISD::ZERO_EXTEND ||
31933 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
31939 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to the zexts. The result is a vector of i64 partial sums; the caller is
// responsible for any further reduction of those sums.
31941 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
31942 const SDValue &Zext1, const SDLoc &DL,
31943 const X86Subtarget &Subtarget) {
31944 // Find the appropriate width for the PSADBW.
// Inputs narrower than 128 bits are padded up to a full XMM register.
31945 EVT InVT = Zext0.getOperand(0).getValueType();
31946 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
31948 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
31949 // fill in the missing vector elements with 0.
// The padding lanes contribute |0 - 0| = 0 to the SAD, so they are harmless.
31950 unsigned NumConcat = RegSize / InVT.getSizeInBits();
31951 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
31952 Ops[0] = Zext0.getOperand(0);
31953 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
31954 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
// Reuse the same zero-padded Ops vector for the second operand.
31955 Ops[0] = Zext1.getOperand(0);
31956 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31958 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
// The builder derives the i64 result type from whatever width the splitter
// hands it, so it works for each split chunk.
31959 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
31960 ArrayRef<SDValue> Ops) {
31961 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
31962 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
31964 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
31965 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
31969 // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
// a PHMINPOSUW node (which natively computes the unsigned-min of v8i16).
// SMIN/SMAX/UMAX are handled by biasing the input with an XOR mask before
// and after the PHMINPOSUW, and v16i8 is handled by zero-extending pairs.
31971 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
31972 const X86Subtarget &Subtarget) {
31973 // Bail without SSE41.
// PHMINPOSUW was introduced with SSE4.1.
31974 if (!Subtarget.hasSSE41())
31977 EVT ExtractVT = Extract->getValueType(0);
31978 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
31981 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
31983 SDValue Src = matchBinOpReduction(
31984 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
// The reduced element type must match the extract, and the source must be
// a whole number of 128-bit registers.
31988 EVT SrcVT = Src.getValueType();
31989 EVT SrcSVT = SrcVT.getScalarType();
31990 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
31994 SDValue MinPos = Src;
31996 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
31997 while (SrcVT.getSizeInBits() > 128) {
31998 unsigned NumElts = SrcVT.getVectorNumElements();
31999 unsigned NumSubElts = NumElts / 2;
32000 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
32001 unsigned SubSizeInBits = SrcVT.getSizeInBits();
32002 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
32003 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
32004 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
32006 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
32007 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
32008 "Unexpected value type");
32010 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
32011 // to flip the value accordingly.
// SMAX: xor with signed-max bias; SMIN: xor with signed-min bias;
// UMAX: xor with all-ones (i.e. bitwise NOT turns max into min).
32013 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
32014 if (BinOp == ISD::SMAX)
32015 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
32016 else if (BinOp == ISD::SMIN)
32017 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
32018 else if (BinOp == ISD::UMAX)
32019 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
32022 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
32024 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
32025 // shuffling each upper element down and insert zeros. This means that the
32026 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
32027 // ready for the PHMINPOS.
32028 if (ExtractVT == MVT::i8) {
// Mask element 16 selects from the second (all-zero) shuffle operand.
32029 SDValue Upper = DAG.getVectorShuffle(
32030 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
32031 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
32032 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
32035 // Perform the PHMINPOS on a v8i16 vector.
32036 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
32037 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
32038 MinPos = DAG.getBitcast(SrcVT, MinPos);
// Undo the earlier bias so the extracted scalar is the true min/max.
32041 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
// PHMINPOS places the minimum in element 0; extract it.
32043 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
32044 DAG.getIntPtrConstant(0, DL));
32047 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
// all_of(AND reduction) => MOVMSK == all-lanes-set; any_of(OR reduction)
// => MOVMSK != 0. Requires every lane to be a sign-splat (all-ones/zero).
32048 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
32050 const X86Subtarget &Subtarget) {
32051 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
32052 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
32055 EVT ExtractVT = Extract->getValueType(0);
32056 unsigned BitWidth = ExtractVT.getSizeInBits();
32057 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
32058 ExtractVT != MVT::i8)
32061 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
32062 unsigned BinOp = 0;
32063 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
32067 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
32068 // which we can't support here for now.
32069 if (Match.getScalarValueSizeInBits() != BitWidth)
32072 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
// 256-bit with 32/64-bit elements only needs AVX (MOVMSKPS/MOVMSKPD).
32073 unsigned MatchSizeInBits = Match.getValueSizeInBits();
32074 if (!(MatchSizeInBits == 128 ||
32075 (MatchSizeInBits == 256 &&
32076 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
32079 // Don't bother performing this for 2-element vectors.
32080 if (Match.getValueType().getVectorNumElements() <= 2)
32083 // Check that we are extracting a reduction of all sign bits.
// MOVMSK only reads the sign bit of each lane, so every lane must be a
// sign-splat for the mask to faithfully represent the predicate.
32084 if (DAG.ComputeNumSignBits(Match) != BitWidth)
32087 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
32089 if (64 == BitWidth || 32 == BitWidth)
32090 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
32091 MatchSizeInBits / BitWidth)
32093 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
32096 ISD::CondCode CondCode;
32097 if (BinOp == ISD::OR) {
32098 // any_of -> MOVMSK != 0
32099 CompareBits = APInt::getNullValue(32);
32100 CondCode = ISD::CondCode::SETNE;
32102 // all_of -> MOVMSK == ((1 << NumElts) - 1)
32103 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
32104 CondCode = ISD::CondCode::SETEQ;
32107 // Perform the select as i32/i64 and then truncate to avoid partial register
32109 unsigned ResWidth = std::max(BitWidth, 32u);
32110 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
// Select all-ones/zero so the sign-extending truncate yields the original
// i8/i16/i32/i64 predicate value.
32112 SDValue Zero = DAG.getConstant(0, DL, ResVT);
32113 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
32114 SDValue Res = DAG.getBitcast(MaskVT, Match);
32115 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
32116 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
32117 Ones, Zero, CondCode);
32118 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
// Attempt to replace an add-reduction over an absolute-difference-of-zexted-i8
// pattern with a PSADBW, followed by a shuffle+add reduction of the partial
// SAD sums if the original vector had more than 8 elements.
32121 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
32122 const X86Subtarget &Subtarget) {
32123 // PSADBW is only supported on SSE2 and up.
32124 if (!Subtarget.hasSSE2())
32127 // Verify the type we're extracting from is any integer type above i16.
32128 EVT VT = Extract->getOperand(0).getValueType();
32129 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
32132 unsigned RegSize = 128;
32133 if (Subtarget.useBWIRegs())
32135 else if (Subtarget.hasAVX())
32138 // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
32139 // TODO: We should be able to handle larger vectors by splitting them before
32140 // feeding them into several SADs, and then reducing over those.
32141 if (RegSize / VT.getVectorNumElements() < 8)
32144 // Match shuffle + add pyramid.
32145 unsigned BinOp = 0;
32146 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
32148 // The operand is expected to be zero extended from i8
32149 // (verified in detectZextAbsDiff).
32150 // In order to convert to i64 and above, additional any/zero/sign
32151 // extend is expected.
32152 // The zero extend from 32 bit has no mathematical effect on the result.
32153 // Also the sign extend is basically zero extend
32154 // (extends the sign bit which is zero).
32155 // So it is correct to skip the sign/zero extend instruction.
32156 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
32157 Root.getOpcode() == ISD::ZERO_EXTEND ||
32158 Root.getOpcode() == ISD::ANY_EXTEND))
32159 Root = Root.getOperand(0);
32161 // If there was a match, we want Root to be a select that is the root of an
32162 // abs-diff pattern.
32163 if (!Root || (Root.getOpcode() != ISD::VSELECT))
32166 // Check whether we have an abs-diff pattern feeding into the select.
32167 SDValue Zext0, Zext1;
32168 if (!detectZextAbsDiff(Root, Zext0, Zext1))
32171 // Create the SAD instruction.
32173 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
32175 // If the original vector was wider than 8 elements, sum over the results
32176 // in the SAD vector.
// PSADBW already sums groups of 8 bytes into i64 lanes, so only
// Stages - 3 further halving steps are needed to finish the reduction.
32177 unsigned Stages = Log2_32(VT.getVectorNumElements());
32178 MVT SadVT = SAD.getSimpleValueType();
32180 unsigned SadElems = SadVT.getVectorNumElements();
32182 for(unsigned i = Stages - 3; i > 0; --i) {
// Shift the upper half of the remaining live lanes down, then add.
32183 SmallVector<int, 16> Mask(SadElems, -1);
32184 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
32185 Mask[j] = MaskEnd + j;
32188 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
32189 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
32193 MVT Type = Extract->getSimpleValueType(0);
32194 unsigned TypeSizeInBits = Type.getSizeInBits();
32195 // Return the lowest TypeSizeInBits bits.
// Bitcast the i64 SAD vector to the extract's element type and pull out
// element 0 (the accumulated sum lives in the low bits).
32196 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
32197 SAD = DAG.getBitcast(ResVT, SAD);
32198 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
32199 Extract->getOperand(1));
32202 // Attempt to peek through a target shuffle and extract the scalar from the
// shuffle's source vector directly (resolving the shuffle mask to find which
// input element the extracted lane comes from).
32204 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
32205 TargetLowering::DAGCombinerInfo &DCI,
32206 const X86Subtarget &Subtarget) {
32207 if (DCI.isBeforeLegalizeOps())
32210 SDValue Src = N->getOperand(0);
32211 SDValue Idx = N->getOperand(1);
32213 EVT VT = N->getValueType(0);
32214 EVT SrcVT = Src.getValueType();
32215 EVT SrcSVT = SrcVT.getVectorElementType();
32216 unsigned NumSrcElts = SrcVT.getVectorNumElements();
32218 // Don't attempt this for boolean mask vectors or unknown extraction indices.
32219 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
32222 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
32223 if (X86ISD::VBROADCAST == Src.getOpcode() &&
32224 Src.getOperand(0).getValueType() == VT)
32225 return Src.getOperand(0);
32227 // Resolve the target shuffle inputs and mask.
32228 SmallVector<int, 16> Mask;
32229 SmallVector<SDValue, 2> Ops;
32230 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
32233 // Attempt to narrow/widen the shuffle mask to the correct size.
// The resolved mask may be at a different granularity than SrcVT if the
// shuffle operated through bitcasts.
32234 if (Mask.size() != NumSrcElts) {
32235 if ((NumSrcElts % Mask.size()) == 0) {
32236 SmallVector<int, 16> ScaledMask;
32237 int Scale = NumSrcElts / Mask.size();
32238 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
32239 Mask = std::move(ScaledMask);
32240 } else if ((Mask.size() % NumSrcElts) == 0) {
32241 SmallVector<int, 16> WidenedMask;
32242 while (Mask.size() > NumSrcElts &&
32243 canWidenShuffleElements(Mask, WidenedMask))
32244 Mask = std::move(WidenedMask);
32245 // TODO - investigate support for wider shuffle masks with known upper
32246 // undef/zero elements for implicit zero-extension.
32250 // Check if narrowing/widening failed.
32251 if (Mask.size() != NumSrcElts)
// Look up which source element the extracted lane maps to.
32254 int SrcIdx = Mask[N->getConstantOperandVal(1)];
32257 // If the shuffle source element is undef/zero then we can just accept it.
32258 if (SrcIdx == SM_SentinelUndef)
32259 return DAG.getUNDEF(VT);
32261 if (SrcIdx == SM_SentinelZero)
32262 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
32263 : DAG.getConstant(0, dl, VT);
// Non-negative mask entries index the concatenation of Ops, so divide to
// select the input and take the remainder as the lane within it.
32265 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
32266 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
32267 SrcIdx = SrcIdx % Mask.size();
32269 // We can only extract other elements from 128-bit vectors and in certain
32270 // circumstances, depending on SSE-level.
32271 // TODO: Investigate using extract_subvector for larger vectors.
32272 // TODO: Investigate float/double extraction if it will be just stored.
// 32/64-bit elements: element 0 works on SSE2 (MOVD/MOVQ); other lanes
// need SSE4.1 (PEXTRD/PEXTRQ).
32273 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
32274 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
32275 assert(SrcSVT == VT && "Unexpected extraction type");
32276 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
32277 DAG.getIntPtrConstant(SrcIdx, dl));
// 16-bit elements: PEXTRW (SSE2). 8-bit elements: PEXTRB (SSE4.1).
// Both produce an implicitly zero-extended i32 result.
32280 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
32281 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
32282 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
32283 "Unexpected extraction type");
32284 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
32285 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
32286 DAG.getIntPtrConstant(SrcIdx, dl));
32287 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
32293 /// Detect vector gather/scatter index generation and convert it from being a
32294 /// bunch of shuffles and extracts into a somewhat faster sequence.
32295 /// For i686, the best sequence is apparently storing the value and loading
32296 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
32297 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
32298 TargetLowering::DAGCombinerInfo &DCI,
32299 const X86Subtarget &Subtarget) {
32300 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
32303 // TODO - Remove this once we can handle the implicit zero-extension of
32304 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
32305 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
32306 // combineBasicSADPattern.
32307 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
32310 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
32313 SDValue InputVector = N->getOperand(0);
32314 SDValue EltIdx = N->getOperand(1);
32316 EVT SrcVT = InputVector.getValueType();
32317 EVT VT = N->getValueType(0);
32318 SDLoc dl(InputVector);
32320 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
32321 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32322 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
32323 SDValue MMXSrc = InputVector.getOperand(0);
32325 // The bitcast source is a direct mmx result.
32326 if (MMXSrc.getValueType() == MVT::x86mmx)
32327 return DAG.getBitcast(VT, InputVector);
32330 // Detect mmx to i32 conversion through a v2i32 elt extract.
32331 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32332 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
32333 SDValue MMXSrc = InputVector.getOperand(0);
32335 // The bitcast source is a direct mmx result.
32336 if (MMXSrc.getValueType() == MVT::x86mmx)
32337 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
// Constant-fold an i1 extract from a bitcasted scalar constant: bit
// ExtractedElt of the constant's value is the result.
32340 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
32341 isa<ConstantSDNode>(EltIdx) &&
32342 isa<ConstantSDNode>(InputVector.getOperand(0))) {
32343 uint64_t ExtractedElt = N->getConstantOperandVal(1);
32344 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
32345 const APInt &InputValue = InputC->getAPIntValue();
32346 uint64_t Res = InputValue[ExtractedElt];
32347 return DAG.getConstant(Res, dl, MVT::i1);
32350 // Check whether this extract is the root of a sum of absolute differences
32351 // pattern. This has to be done here because we really want it to happen
32352 // pre-legalization,
32353 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
32356 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
32357 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
32360 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
32361 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
32367 /// If a vector select has an operand that is -1 or 0, try to simplify the
32368 /// select to a bitwise logic operation.
32369 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
32371 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
32372 TargetLowering::DAGCombinerInfo &DCI,
32373 const X86Subtarget &Subtarget) {
32374 SDValue Cond = N->getOperand(0);
32375 SDValue LHS = N->getOperand(1);
32376 SDValue RHS = N->getOperand(2);
32377 EVT VT = LHS.getValueType();
32378 EVT CondVT = Cond.getValueType();
32380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32382 if (N->getOpcode() != ISD::VSELECT)
32385 assert(CondVT.isVector() && "Vector select expects a vector selector!");
32387 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
32388 // Check if the first operand is all zeros and Cond type is vXi1.
32389 // This situation only applies to avx512.
32390 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
32391 CondVT.getVectorElementType() == MVT::i1) {
32392 // Invert the cond to not(cond) : xor(op,allones)=not(op)
32393 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT)
32394 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
32395 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
32398 // To use the condition operand as a bitwise mask, it must have elements that
32399 // are the same size as the select elements. Ie, the condition operand must
32400 // have already been promoted from the IR select condition type <N x i1>.
32401 // Don't check if the types themselves are equal because that excludes
32402 // vector floating-point selects.
32403 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
32406 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
32407 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
32409 // Try to invert the condition if true value is not all 1s and false value is
32411 if (!TValIsAllOnes && !FValIsAllZeros &&
32412 // Check if the selector will be produced by CMPP*/PCMP*.
32413 Cond.getOpcode() == ISD::SETCC &&
32414 // Check if SETCC has already been promoted.
32415 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
32417 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
32419 if (TValIsAllZeros || FValIsAllOnes) {
// Invert the comparison and swap the select arms so the all-ones/zeros
// constants land in the positions the folds below expect.
32420 SDValue CC = Cond.getOperand(2);
32421 ISD::CondCode NewCC =
32422 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
32423 Cond.getOperand(0).getValueType().isInteger());
32424 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
32426 std::swap(LHS, RHS);
32427 TValIsAllOnes = FValIsAllOnes;
32428 FValIsAllZeros = TValIsAllZeros;
32432 // Cond value must be 'sign splat' to be converted to a logical op.
32433 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
32436 // vselect Cond, 111..., 000... -> Cond
32437 if (TValIsAllOnes && FValIsAllZeros)
32438 return DAG.getBitcast(VT, Cond);
32440 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
32443 // vselect Cond, 111..., X -> or Cond, X
32444 if (TValIsAllOnes) {
32445 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
32446 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
32447 return DAG.getBitcast(VT, Or);
32450 // vselect Cond, X, 000... -> and Cond, X
32451 if (FValIsAllZeros) {
32452 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
32453 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
32454 return DAG.getBitcast(VT, And);
32457 // vselect Cond, 000..., X -> andn Cond, X
32458 if (TValIsAllZeros) {
// ANDNP is only defined for FP-ish vector types here, so perform it as
// i64 lanes and bitcast back.
32459 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
32460 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
32461 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
32462 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
32463 return DAG.getBitcast(VT, AndN);
// Fold select(Cond, TC, FC) of two integer constants into arithmetic on the
// zero-extended condition: (zext(Cond) * (TC - FC)) + FC, which lowers to a
// shift or LEA instead of a CMOV/branch.
32469 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
32470 SDValue Cond = N->getOperand(0);
32471 SDValue LHS = N->getOperand(1);
32472 SDValue RHS = N->getOperand(2);
32475 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
32476 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
32477 if (!TrueC || !FalseC)
32480 // Don't do this for crazy integer types.
32481 EVT VT = N->getValueType(0);
32482 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32485 // We're going to use the condition bit in math or logic ops. We could allow
32486 // this with a wider condition value (post-legalization it becomes an i8),
32487 // but if nothing is creating selects that late, it doesn't matter.
32488 if (Cond.getValueType() != MVT::i1)
32491 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
32492 // 3, 5, or 9 with i32/i64, so those get transformed too.
32493 // TODO: For constants that overflow or do not differ by power-of-2 or small
32494 // multiplier, convert to 'and' + 'add'.
32495 const APInt &TrueVal = TrueC->getAPIntValue();
32496 const APInt &FalseVal = FalseC->getAPIntValue();
// Bail out if TrueVal - FalseVal overflows signed arithmetic.
32498 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
32502 APInt AbsDiff = Diff.abs();
32503 if (AbsDiff.isPowerOf2() ||
32504 ((VT == MVT::i32 || VT == MVT::i64) &&
32505 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
32507 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
32508 // of the condition can usually be folded into a compare predicate, but even
32509 // without that, the sequence should be cheaper than a CMOV alternative.
32510 if (TrueVal.slt(FalseVal)) {
32511 Cond = DAG.getNOT(DL, Cond, MVT::i1);
32512 std::swap(TrueC, FalseC);
32515 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
32516 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
32518 // Multiply condition by the difference if non-one.
32519 if (!AbsDiff.isOneValue())
32520 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
32522 // Add the base if non-zero.
32523 if (!FalseC->isNullValue())
32524 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
32532 /// If this is a *dynamic* select (non-constant condition) and we can match
32533 /// this node with one of the variable blend instructions, restructure the
32534 /// condition so that blends can use the high (sign) bit of each element.
32535 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
32536 TargetLowering::DAGCombinerInfo &DCI,
32537 const X86Subtarget &Subtarget) {
32538 SDValue Cond = N->getOperand(0);
32539 if (N->getOpcode() != ISD::VSELECT ||
32540 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
32543 // Don't optimize before the condition has been transformed to a legal type
32544 // and don't ever optimize vector selects that map to AVX512 mask-registers.
32545 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32546 if (BitWidth < 8 || BitWidth > 64)
32549 // We can only handle the cases where VSELECT is directly legal on the
32550 // subtarget. We custom lower VSELECT nodes with constant conditions and
32551 // this makes it hard to see whether a dynamic VSELECT will correctly
32552 // lower, so we both check the operation's status and explicitly handle the
32553 // cases where a *dynamic* blend will fail even though a constant-condition
32554 // blend could be custom lowered.
32555 // FIXME: We should find a better way to handle this class of problems.
32556 // Potentially, we should combine constant-condition vselect nodes
32557 // pre-legalization into shuffles and not mark as many types as custom
32559 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32560 EVT VT = N->getValueType(0);
32561 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32563 // FIXME: We don't support i16-element blends currently. We could and
32564 // should support them by making *all* the bits in the condition be set
32565 // rather than just the high bit and using an i8-element blend.
32566 if (VT.getVectorElementType() == MVT::i16)
32568 // Dynamic blending was only available from SSE4.1 onward.
32569 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32571 // Byte blends are only available in AVX2
32572 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32574 // There are no 512-bit blend instructions that use sign bits.
32575 if (VT.is512BitVector())
// All users of Cond must be VSELECTs using it as the condition, since the
// demanded-bits simplification below (only the sign bit) would be wrong
// for any other kind of use.
32578 // TODO: Add other opcodes eventually lowered into BLEND.
32579 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32581 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
// Only the sign bit of each condition element is consumed by a blend.
32584 APInt DemandedMask(APInt::getSignMask(BitWidth));
32586 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32587 !DCI.isBeforeLegalizeOps());
32588 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
32591 // If we changed the computation somewhere in the DAG, this change will
32592 // affect all users of Cond. Update all the nodes so that we do not use
32593 // the generic VSELECT anymore. Otherwise, we may perform wrong
32594 // optimizations as we messed with the actual expectation for the vector
32596 for (SDNode *U : Cond->uses()) {
32597 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
32598 Cond, U->getOperand(1), U->getOperand(2));
32599 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32601 DCI.CommitTargetLoweringOpt(TLO);
// Return N itself to signal that the DAG was updated in place.
32602 return SDValue(N, 0);
32605 /// Do target-specific dag combines on SELECT and VSELECT nodes.
/// Tries, in order: constant-condition -> shuffle, FP min/max (FMIN/FMAX)
/// formation, AVX-512 scalar mask-select cleanup, i1-condition widening
/// without BWI, constant-arm folds, condition-code canonicalization,
/// unsigned-saturating SUBUS matching, all-ones/zero vselect folds,
/// shrunk blends, and MMX selects.
// NOTE(review): this excerpt is non-contiguous (interior source lines are
// elided), so several if/for/switch bodies below appear without their
// statements. Confirm control flow against the full file before editing.
32606 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
32607 TargetLowering::DAGCombinerInfo &DCI,
32608 const X86Subtarget &Subtarget) {
32610 SDValue Cond = N->getOperand(0);
32611 // Get the LHS/RHS of the select.
32612 SDValue LHS = N->getOperand(1);
32613 SDValue RHS = N->getOperand(2);
32614 EVT VT = LHS.getValueType();
32615 EVT CondVT = Cond.getValueType();
32616 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32618 // Convert vselects with constant condition into shuffles.
32619 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
32620 DCI.isBeforeLegalizeOps()) {
32621 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
32622 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
32623 SDValue CondElt = Cond->getOperand(i);
32625 // Arbitrarily choose from the 2nd operand if the select condition element
32627 // TODO: Can we do better by matching patterns such as even/odd?
32628 if (CondElt.isUndef() || isNullConstant(CondElt))
// (mask-element assignment elided in this excerpt — presumably picks the
//  RHS lane when the condition element is undef/zero; TODO confirm)
32632 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
32635 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
32636 // instructions match the semantics of the common C idiom x<y?x:y but not
32637 // x<=y?x:y, because of how they handle negative zero (which can be
32638 // ignored in unsafe-math mode).
32639 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
32640 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
32641 VT != MVT::f80 && VT != MVT::f128 &&
32642 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
32643 (Subtarget.hasSSE2() ||
32644 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
32645 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32647 unsigned Opcode = 0;
32648 // Check for x CC y ? x : y.
32649 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32650 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
// (the switch over CC classifying each comparison is partially elided;
//  only per-case commentary and the Opcode assignments remain visible)
32654 // Converting this to a min would handle NaNs incorrectly, and swapping
32655 // the operands would cause it to handle comparisons between positive
32656 // and negative zero incorrectly.
32657 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32658 if (!DAG.getTarget().Options.UnsafeFPMath &&
32659 !(DAG.isKnownNeverZeroFloat(LHS) ||
32660 DAG.isKnownNeverZeroFloat(RHS)))
32662 std::swap(LHS, RHS);
32664 Opcode = X86ISD::FMIN;
32667 // Converting this to a min would handle comparisons between positive
32668 // and negative zero incorrectly.
32669 if (!DAG.getTarget().Options.UnsafeFPMath &&
32670 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32672 Opcode = X86ISD::FMIN;
32675 // Converting this to a min would handle both negative zeros and NaNs
32676 // incorrectly, but we can swap the operands to fix both.
32677 std::swap(LHS, RHS);
32682 Opcode = X86ISD::FMIN;
32686 // Converting this to a max would handle comparisons between positive
32687 // and negative zero incorrectly.
32688 if (!DAG.getTarget().Options.UnsafeFPMath &&
32689 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32691 Opcode = X86ISD::FMAX;
32694 // Converting this to a max would handle NaNs incorrectly, and swapping
32695 // the operands would cause it to handle comparisons between positive
32696 // and negative zero incorrectly.
32697 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32698 if (!DAG.getTarget().Options.UnsafeFPMath &&
32699 !(DAG.isKnownNeverZeroFloat(LHS) ||
32700 DAG.isKnownNeverZeroFloat(RHS)))
32702 std::swap(LHS, RHS);
32704 Opcode = X86ISD::FMAX;
32707 // Converting this to a max would handle both negative zeros and NaNs
32708 // incorrectly, but we can swap the operands to fix both.
32709 std::swap(LHS, RHS);
32714 Opcode = X86ISD::FMAX;
32717 // Check for x CC y ? y : x -- a min/max with reversed arms.
32718 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32719 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32723 // Converting this to a min would handle comparisons between positive
32724 // and negative zero incorrectly, and swapping the operands would
32725 // cause it to handle NaNs incorrectly.
32726 if (!DAG.getTarget().Options.UnsafeFPMath &&
32727 !(DAG.isKnownNeverZeroFloat(LHS) ||
32728 DAG.isKnownNeverZeroFloat(RHS))) {
32729 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32731 std::swap(LHS, RHS);
32733 Opcode = X86ISD::FMIN;
32736 // Converting this to a min would handle NaNs incorrectly.
32737 if (!DAG.getTarget().Options.UnsafeFPMath &&
32738 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32740 Opcode = X86ISD::FMIN;
32743 // Converting this to a min would handle both negative zeros and NaNs
32744 // incorrectly, but we can swap the operands to fix both.
32745 std::swap(LHS, RHS);
32750 Opcode = X86ISD::FMIN;
32754 // Converting this to a max would handle NaNs incorrectly.
32755 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32757 Opcode = X86ISD::FMAX;
32760 // Converting this to a max would handle comparisons between positive
32761 // and negative zero incorrectly, and swapping the operands would
32762 // cause it to handle NaNs incorrectly.
32763 if (!DAG.getTarget().Options.UnsafeFPMath &&
32764 !DAG.isKnownNeverZeroFloat(LHS) &&
32765 !DAG.isKnownNeverZeroFloat(RHS)) {
32766 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32768 std::swap(LHS, RHS);
32770 Opcode = X86ISD::FMAX;
32773 // Converting this to a max would handle both negative zeros and NaNs
32774 // incorrectly, but we can swap the operands to fix both.
32775 std::swap(LHS, RHS);
32780 Opcode = X86ISD::FMAX;
// Emit the selected FMIN/FMAX node once a legal classification was found.
32786 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
32789 // Some mask scalar intrinsics rely on checking if only one bit is set
32790 // and implement it in C code like this:
32791 // A[0] = (U & 1) ? A[0] : W[0];
32792 // This creates some redundant instructions that break pattern matching.
32793 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
32794 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
32795 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
32796 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32797 SDValue AndNode = Cond.getOperand(0);
32798 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
32799 isNullConstant(Cond.getOperand(1)) &&
32800 isOneConstant(AndNode.getOperand(1))) {
32801 // LHS and RHS swapped due to
32802 // setcc outputting 1 when AND resulted in 0 and vice versa.
32803 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
32804 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
32808 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32809 // lowering on KNL. In this case we convert it to
32810 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
32811 // The same situation all vectors of i8 and i16 without BWI.
32812 // Make sure we extend these even before type legalization gets a chance to
32813 // split wide vectors.
32814 // Since SKX these selects have a proper lowering.
32815 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32816 CondVT.getVectorElementType() == MVT::i1 &&
32817 VT.getVectorNumElements() > 4 &&
32818 (VT.getVectorElementType() == MVT::i8 ||
32819 VT.getVectorElementType() == MVT::i16)) {
32820 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32821 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32824 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32827 // Canonicalize max and min:
32828 // (x > y) ? x : y -> (x >= y) ? x : y
32829 // (x < y) ? x : y -> (x <= y) ? x : y
32830 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32831 // the need for an extra compare
32832 // against zero. e.g.
32833 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
32835 // testl %edi, %edi
32837 // cmovgl %edi, %eax
32841 // cmovsl %eax, %edi
32842 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32843 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32844 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32845 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// (a guard restricting this to SETLT/SETGT appears elided here — the NewCC
//  computation below only makes sense for those two codes; TODO confirm)
32850 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32851 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32852 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32853 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32858 // Early exit check
32859 if (!TLI.isTypeLegal(VT))
32862 // Match VSELECTs into subs with unsigned saturation.
32863 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32864 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32865 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32866 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32867 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32869 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32870 // left side invert the predicate to simplify logic below.
32872 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32874 CC = ISD::getSetCCInverse(CC, true);
32875 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
// ('Other' is declared in an elided line above — presumably the non-zero
//  arm of the VSELECT; TODO confirm against the full file)
32879 if (Other.getNode() && Other->getNumOperands() == 2 &&
32880 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32881 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32882 SDValue CondRHS = Cond->getOperand(1);
32884 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32885 ArrayRef<SDValue> Ops) {
32886 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32889 // Look for a general sub with unsigned saturation first.
32890 // x >= y ? x-y : 0 --> subus x, y
32891 // x > y ? x-y : 0 --> subus x, y
32892 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32893 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32894 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32897 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32898 if (isa<BuildVectorSDNode>(CondRHS)) {
32899 // If the RHS is a constant we have to reverse the const
32900 // canonicalization.
32901 // x > C-1 ? x+-C : 0 --> subus x, C
32902 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32903 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32905 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32906 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32907 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32908 DAG.getConstant(0, DL, VT), OpRHS);
32909 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32913 // Another special case: If C was a sign bit, the sub has been
32914 // canonicalized into a xor.
32915 // FIXME: Would it be better to use computeKnownBits to determine
32916 // whether it's safe to decanonicalize the xor?
32917 // x s< 0 ? x^C : 0 --> subus x, C
32918 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32919 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32920 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32921 OpRHSConst->getAPIntValue().isSignMask()) {
32922 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32923 // Note that we have to rebuild the RHS constant here to ensure we
32924 // don't rely on particular values of undef lanes.
32925 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32932 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32935 if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
32938 // Custom action for SELECT MMX
32939 if (VT == MVT::x86mmx) {
32940 LHS = DAG.getBitcast(MVT::i64, LHS);
32941 RHS = DAG.getBitcast(MVT::i64, RHS);
32942 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32943 return DAG.getBitcast(VT, newSelect);
/// Combine a compare of an atomic add/sub result against a constant so the
/// EFLAGS produced by the LOCKed instruction itself are reused, e.g.:
32950 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
32952 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
32953 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32954 /// Note that this is only legal for some op/cc combinations.
// NOTE(review): interior lines of this function are elided in this excerpt
// (several early 'return SDValue()'s and CC reassignments are not visible);
// confirm against the full file before editing.
32955 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32957 const X86Subtarget &Subtarget) {
32958 // This combine only operates on CMP-like nodes.
32959 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32960 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32963 // Can't replace the cmp if it has more uses than the one we're looking at.
32964 // FIXME: We would like to be able to handle this, but would need to make sure
32965 // all uses were updated.
32966 if (!Cmp.hasOneUse())
32969 // This only applies to variations of the common case:
32970 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32971 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32972 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32973 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32974 // Using the proper condcodes (see below), overflow is checked for.
32976 // FIXME: We can generalize both constraints:
32977 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32979 // if the result is compared.
32981 SDValue CmpLHS = Cmp.getOperand(0);
32982 SDValue CmpRHS = Cmp.getOperand(1);
32984 if (!CmpLHS.hasOneUse())
// Only atomic add/sub RMW ops are handled; other RMW opcodes bail out.
32987 unsigned Opc = CmpLHS.getOpcode();
32988 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32991 SDValue OpRHS = CmpLHS.getOperand(2);
32992 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
32996 APInt Addend = OpRHSC->getAPIntValue();
// Normalize: treat an atomic sub of C as an add of -C (negation elided here).
32997 if (Opc == ISD::ATOMIC_LOAD_SUB)
33000 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
33004 APInt Comparison = CmpRHSC->getAPIntValue();
33006 // If the addend is the negation of the comparison value, then we can do
33007 // a full comparison by emitting the atomic arithmetic as a locked sub.
33008 if (Comparison == -Addend) {
33009 // The CC is fine, but we need to rewrite the LHS of the comparison as an
33011 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
33012 auto AtomicSub = DAG.getAtomic(
33013 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
33014 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
33015 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
33016 AN->getMemOperand());
33017 // If the comparision uses the CF flag we can't use INC/DEC instructions.
33018 bool NeedCF = false;
// (the switch header over CC is elided; CF-consuming codes set NeedCF)
33021 case X86::COND_A: case X86::COND_AE:
33022 case X86::COND_B: case X86::COND_BE:
33026 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
33027 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33028 DAG.getUNDEF(CmpLHS.getValueType()));
33029 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33033 // We can handle comparisons with zero in a number of cases by manipulating
33035 if (!Comparison.isNullValue())
// Per-case CC rewrites follow; the new CC assignments themselves are elided
// in this excerpt (presumably the sle/sgt/slt/sge swaps documented above).
33038 if (CC == X86::COND_S && Addend == 1)
33040 else if (CC == X86::COND_NS && Addend == 1)
33042 else if (CC == X86::COND_G && Addend == -1)
33044 else if (CC == X86::COND_LE && Addend == -1)
33049 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
33050 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33051 DAG.getUNDEF(CmpLHS.getValueType()));
33052 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33056 // Check whether a boolean test is testing a boolean value generated by
33057 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
33060 // Simplify the following patterns:
33061 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
33062 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
33063 // to (Op EFLAGS Cond)
33065 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
33066 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
33067 // to (Op EFLAGS !Cond)
33069 // where Op could be BRCOND or CMOV.
// NOTE(review): interior lines are elided in this excerpt (early returns,
// the SetCC/OpIdx declarations); confirm against the full file.
33071 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
33072 // This combine only operates on CMP-like nodes.
33073 if (!(Cmp.getOpcode() == X86ISD::CMP ||
33074 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
33077 // Quit if not used as a boolean value.
33078 if (CC != X86::COND_E && CC != X86::COND_NE)
33081 // Check CMP operands. One of them should be 0 or 1 and the other should be
33082 // an SetCC or extended from it.
33083 SDValue Op1 = Cmp.getOperand(0);
33084 SDValue Op2 = Cmp.getOperand(1);
33087 const ConstantSDNode* C = nullptr;
33088 bool needOppositeCond = (CC == X86::COND_E);
33089 bool checkAgainstTrue = false; // Is it a comparison against 1?
33091 if ((C = dyn_cast<ConstantSDNode>(Op1)))
33093 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
33095 else // Quit if all operands are not constants.
// Comparing against 1 flips the polarity of the condition we will return.
33098 if (C->getZExtValue() == 1) {
33099 needOppositeCond = !needOppositeCond;
33100 checkAgainstTrue = true;
33101 } else if (C->getZExtValue() != 0)
33102 // Quit if the constant is neither 0 or 1.
33105 bool truncatedToBoolWithAnd = false;
33106 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
33107 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
33108 SetCC.getOpcode() == ISD::TRUNCATE ||
33109 SetCC.getOpcode() == ISD::AND) {
33110 if (SetCC.getOpcode() == ISD::AND) {
33112 if (isOneConstant(SetCC.getOperand(0)))
33114 if (isOneConstant(SetCC.getOperand(1)))
33118 SetCC = SetCC.getOperand(OpIdx);
33119 truncatedToBoolWithAnd = true;
33121 SetCC = SetCC.getOperand(0);
33124 switch (SetCC.getOpcode()) {
33125 case X86ISD::SETCC_CARRY:
33126 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
33127 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
33128 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
33129 // truncated to i1 using 'and'.
33130 if (checkAgainstTrue && !truncatedToBoolWithAnd)
33132 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
33133 "Invalid use of SETCC_CARRY!");
33135 case X86ISD::SETCC:
33136 // Set the condition code or opposite one if necessary.
33137 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
33138 if (needOppositeCond)
33139 CC = X86::GetOppositeBranchCondition(CC);
33140 return SetCC.getOperand(1);
33141 case X86ISD::CMOV: {
33142 // Check whether false/true value has canonical one, i.e. 0 or 1.
33143 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
33144 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
33145 // Quit if true value is not a constant.
33148 // Quit if false value is not a constant.
// (the early 'return SDValue()'s for the two quits above are elided)
33150 SDValue Op = SetCC.getOperand(0);
33151 // Skip 'zext' or 'trunc' node.
33152 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
33153 Op.getOpcode() == ISD::TRUNCATE)
33154 Op = Op.getOperand(0);
33155 // A special case for rdrand/rdseed, where 0 is set if false cond is
33157 if ((Op.getOpcode() != X86ISD::RDRAND &&
33158 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
33161 // Quit if false value is not the constant 0 or 1.
33162 bool FValIsFalse = true;
33163 if (FVal && FVal->getZExtValue() != 0) {
33164 if (FVal->getZExtValue() != 1)
33166 // If FVal is 1, opposite cond is needed.
33167 needOppositeCond = !needOppositeCond;
33168 FValIsFalse = false;
33170 // Quit if TVal is not the constant opposite of FVal.
33171 if (FValIsFalse && TVal->getZExtValue() != 1)
33173 if (!FValIsFalse && TVal->getZExtValue() != 0)
33175 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
33176 if (needOppositeCond)
33177 CC = X86::GetOppositeBranchCondition(CC);
33178 return SetCC.getOperand(3);
33185 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
33187 /// (X86or (X86setcc) (X86setcc))
33188 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
/// On success, fills in CC0/CC1 with the two condition codes and Flags with
/// the shared EFLAGS value. Returns false when the pattern does not match.
// NOTE(review): the non-default switch case labels and a trailing parameter
// of the signature are elided in this excerpt; confirm against the full file.
33189 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
33190 X86::CondCode &CC1, SDValue &Flags,
// Peel an outer (X86cmp X, 0) so we can inspect X directly.
33192 if (Cond->getOpcode() == X86ISD::CMP) {
33193 if (!isNullConstant(Cond->getOperand(1)))
33196 Cond = Cond->getOperand(0);
33201 SDValue SetCC0, SetCC1;
33202 switch (Cond->getOpcode()) {
33203 default: return false;
33210 SetCC0 = Cond->getOperand(0);
33211 SetCC1 = Cond->getOperand(1);
33215 // Make sure we have SETCC nodes, using the same flags value.
33216 if (SetCC0.getOpcode() != X86ISD::SETCC ||
33217 SetCC1.getOpcode() != X86ISD::SETCC ||
33218 SetCC0->getOperand(1) != SetCC1->getOperand(1))
33221 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
33222 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
33223 Flags = SetCC0->getOperand(1);
33227 // When legalizing carry, we create carries via add X, -1
33228 // If that comes from an actual carry, via setcc, we use the
// Peek through an (add X, -1) used to materialize a carry: walk X through
// trunc/zext/sext/anyext and (and _, 1) wrappers, and if it bottoms out in a
// SETCC/SETCC_CARRY testing COND_B, return that node's EFLAGS operand.
// (The fall-through 'return SDValue()' is elided in this excerpt.)
33230 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
33231 if (EFLAGS.getOpcode() == X86ISD::ADD) {
33232 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
33233 SDValue Carry = EFLAGS.getOperand(0);
33234 while (Carry.getOpcode() == ISD::TRUNCATE ||
33235 Carry.getOpcode() == ISD::ZERO_EXTEND ||
33236 Carry.getOpcode() == ISD::SIGN_EXTEND ||
33237 Carry.getOpcode() == ISD::ANY_EXTEND ||
33238 (Carry.getOpcode() == ISD::AND &&
33239 isOneConstant(Carry.getOperand(1))))
33240 Carry = Carry.getOperand(0);
33241 if (Carry.getOpcode() == X86ISD::SETCC ||
33242 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
33243 if (Carry.getConstantOperandVal(0) == X86::COND_B)
33244 return Carry.getOperand(1);
33252 /// Optimize an EFLAGS definition used according to the condition code \p CC
33253 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
33254 /// uses of chain values.
// (The SelectionDAG parameter of the signature and the 'return' bodies of the
//  first two ifs are elided in this excerpt.)
33255 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
33257 const X86Subtarget &Subtarget) {
// For a carry test, first try to look through the add X, -1 carry idiom.
33258 if (CC == X86::COND_B)
33259 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
// Then try the boolean-test-of-SETCC simplification.
33262 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
// Finally, try to fold a compare of an atomic RMW result into its LOCKed op.
33264 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
33267 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
/// Tries, in order: EFLAGS/CC simplification, two-constant-arm folds
/// (shift/add/LEA forms), replacing a constant arm with the compared
/// register, splitting and/or-of-setcc conditions into double CMOVs, and a
/// CTTZ+constant rebalancing fold.
// NOTE(review): this excerpt is non-contiguous; several closing braces and
// statements are elided. Confirm control flow against the full file.
33268 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
33269 TargetLowering::DAGCombinerInfo &DCI,
33270 const X86Subtarget &Subtarget) {
33273 SDValue FalseOp = N->getOperand(0);
33274 SDValue TrueOp = N->getOperand(1);
33275 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
33276 SDValue Cond = N->getOperand(3);
33278 // Try to simplify the EFLAGS and condition code operands.
33279 // We can't always do this as FCMOV only supports a subset of X86 cond.
33280 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
33281 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
33282 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
33284 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33288 // If this is a select between two integer constants, try to do some
33289 // optimizations. Note that the operands are ordered the opposite of SELECT
33291 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
33292 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
33293 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
33294 // larger than FalseC (the false value).
33295 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
33296 CC = X86::GetOppositeBranchCondition(CC);
33297 std::swap(TrueC, FalseC);
33298 std::swap(TrueOp, FalseOp);
33301 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
33302 // This is efficient for any integer data type (including i8/i16) and
33304 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
33305 Cond = getSETCC(CC, Cond, DL, DAG);
33307 // Zero extend the condition if needed.
33308 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
33310 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
33311 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
33312 DAG.getConstant(ShAmt, DL, MVT::i8));
33316 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
33317 // for any integer data type, including i8/i16.
33318 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
33319 Cond = getSETCC(CC, Cond, DL, DAG);
33321 // Zero extend the condition if needed.
33322 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
33323 FalseC->getValueType(0), Cond);
33324 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33325 SDValue(FalseC, 0));
33329 // Optimize cases that will turn into an LEA instruction. This requires
33330 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
33331 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
33332 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
33333 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
33335 bool isFastMultiplier = false;
// Only scale factors LEA can encode in one instruction are accepted.
33337 switch ((unsigned char)Diff) {
33339 case 1: // result = add base, cond
33340 case 2: // result = lea base( , cond*2)
33341 case 3: // result = lea base(cond, cond*2)
33342 case 4: // result = lea base( , cond*4)
33343 case 5: // result = lea base(cond, cond*4)
33344 case 8: // result = lea base( , cond*8)
33345 case 9: // result = lea base(cond, cond*8)
33346 isFastMultiplier = true;
33351 if (isFastMultiplier) {
33352 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
33353 Cond = getSETCC(CC, Cond, DL ,DAG);
33354 // Zero extend the condition if needed.
33355 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
33357 // Scale the condition by the difference.
33359 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
33360 DAG.getConstant(Diff, DL, Cond.getValueType()));
33362 // Add the base if non-zero.
33363 if (FalseC->getAPIntValue() != 0)
33364 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33365 SDValue(FalseC, 0));
33372 // Handle these cases:
33373 // (select (x != c), e, c) -> select (x != c), e, x),
33374 // (select (x == c), c, e) -> select (x == c), x, e)
33375 // where the c is an integer constant, and the "select" is the combination
33376 // of CMOV and CMP.
33378 // The rationale for this change is that the conditional-move from a constant
33379 // needs two instructions, however, conditional-move from a register needs
33380 // only one instruction.
33382 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
33383 // some instruction-combining opportunities. This opt needs to be
33384 // postponed as late as possible.
33386 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
33387 // the DCI.xxxx conditions are provided to postpone the optimization as
33388 // late as possible.
33390 ConstantSDNode *CmpAgainst = nullptr;
33391 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
33392 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
33393 !isa<ConstantSDNode>(Cond.getOperand(0))) {
// Normalize the != form into the == form by inverting the condition.
33395 if (CC == X86::COND_NE &&
33396 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
33397 CC = X86::GetOppositeBranchCondition(CC);
33398 std::swap(TrueOp, FalseOp);
33401 if (CC == X86::COND_E &&
33402 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
33403 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
33404 DAG.getConstant(CC, DL, MVT::i8), Cond };
33405 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33410 // Fold and/or of setcc's to double CMOV:
33411 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
33412 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
33414 // This combine lets us generate:
33415 // cmovcc1 (jcc1 if we don't have CMOV)
33421 // cmovne (jne if we don't have CMOV)
33422 // When we can't use the CMOV instruction, it might increase branch
33424 // When we can use CMOV, or when there is no mispredict, this improves
33425 // throughput and reduces register pressure.
33427 if (CC == X86::COND_NE) {
33429 X86::CondCode CC0, CC1;
// (the 'isAndSetCC'/'Flags' declarations are elided in this excerpt)
33431 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// For the AND form, swap the arms and invert both condition codes.
33433 std::swap(FalseOp, TrueOp);
33434 CC0 = X86::GetOppositeBranchCondition(CC0);
33435 CC1 = X86::GetOppositeBranchCondition(CC1);
33438 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
33440 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
33441 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
33442 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33447 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
33448 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
33449 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
33450 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
33451 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
33452 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
33453 SDValue Add = TrueOp;
33454 SDValue Const = FalseOp;
33455 // Canonicalize the condition code for easier matching and output.
33456 if (CC == X86::COND_E) {
33457 std::swap(Add, Const);
33461 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
33462 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
33463 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
33464 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
33465 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
33466 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
33467 EVT VT = N->getValueType(0);
33468 // This should constant fold.
33469 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
33470 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
33471 DAG.getConstant(CC, DL, MVT::i8), Cond);
33472 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
33479 /// Different mul shrinking modes.
/// MULS8:  both operands known to fit a signed 8-bit range (-128..127).
/// MULU8:  both operands known to fit an unsigned 8-bit range (0..255).
/// MULS16: both operands known to fit a signed 16-bit range (-32768..32767).
/// MULU16: both operands known to fit an unsigned 16-bit range (0..65535).
/// (Ranges per the threshold comments in canReduceVMulWidth below.)
33480 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
/// Compute how many sign bits each operand of a 32-bit-element vector multiply
/// is known to have and, from the minimum, pick the narrowest safe ShrinkMode
/// in \p Mode. Non-32-bit-element multiplies are rejected.
// NOTE(review): the 'return' statements after each threshold test and the
// SignBits assignments in the any-extend branch are elided in this excerpt;
// confirm against the full file before editing.
33482 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
33483 EVT VT = N->getOperand(0).getValueType();
33484 if (VT.getScalarSizeInBits() != 32)
33487 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
33488 unsigned SignBits[2] = {1, 1};
33489 bool IsPositive[2] = {false, false};
33490 for (unsigned i = 0; i < 2; i++) {
33491 SDValue Opd = N->getOperand(i);
33493 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
33494 // compute signbits for it separately.
33495 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
33496 // For anyextend, it is safe to assume an appropriate number of leading
33498 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
33500 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
33505 IsPositive[i] = true;
33506 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
33507 // All the operands of BUILD_VECTOR need to be int constant.
33508 // Find the smallest value range which all the operands belong to.
33510 IsPositive[i] = true;
33511 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
33512 if (SubOp.isUndef())
33514 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
33517 APInt IntVal = CN->getAPIntValue();
33518 if (IntVal.isNegative())
33519 IsPositive[i] = false;
// Track the worst (smallest) sign-bit count over all constant lanes.
33520 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
33523 SignBits[i] = DAG.ComputeNumSignBits(Opd);
33524 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
33525 IsPositive[i] = true;
33529 bool AllPositive = IsPositive[0] && IsPositive[1];
33530 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
33531 // When ranges are from -128 ~ 127, use MULS8 mode.
33532 if (MinSignBits >= 25)
33534 // When ranges are from 0 ~ 255, use MULU8 mode.
33535 else if (AllPositive && MinSignBits >= 24)
33537 // When ranges are from -32768 ~ 32767, use MULS16 mode.
33538 else if (MinSignBits >= 17)
33540 // When ranges are from 0 ~ 65535, use MULU16 mode.
33541 else if (AllPositive && MinSignBits >= 16)
33548 /// When the operands of vector mul are extended from smaller size values,
33549 /// like i8 and i16, the type of mul may be shrinked to generate more
33550 /// efficient code. Two typical patterns are handled:
33552 /// %2 = sext/zext <N x i8> %1 to <N x i32>
33553 /// %4 = sext/zext <N x i8> %3 to <N x i32>
33554 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33555 /// %5 = mul <N x i32> %2, %4
33558 /// %2 = zext/sext <N x i16> %1 to <N x i32>
33559 /// %4 = zext/sext <N x i16> %3 to <N x i32>
33560 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33561 /// %5 = mul <N x i32> %2, %4
33563 /// There are four mul shrinking modes:
33564 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
33565 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
33566 /// generate pmullw+sext32 for it (MULS8 mode).
33567 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
33568 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
33569 /// generate pmullw+zext32 for it (MULU8 mode).
33570 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
33571 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
33572 /// generate pmullw+pmulhw for it (MULS16 mode).
33573 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
33574 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
33575 /// generate pmullw+pmulhuw for it (MULU16 mode).
// NOTE(review): this extraction has elided lines (early `return SDValue();`
// statements, the `ShrinkMode Mode;` declaration and `SDLoc DL(N);` among
// others, judging by the gaps in the fused line numbering). The comments
// below describe only what the visible statements establish.
33576 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
33577 const X86Subtarget &Subtarget) {
33578 // Check for legality
33579 // pmullw/pmulhw are not supported by SSE.
33580 if (!Subtarget.hasSSE2())
33583 // Check for profitability
33584 // pmulld is supported since SSE41. It is better to use pmulld
33585 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// (continues: "...pmullw+pmulhw", per isPMULLDSlow() below; when optimizing
// for size a single pmulld is preferred regardless.)
33587 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
33588 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
// canReduceVMulWidth classifies the operands into one of the four shrink
// modes (MULS8/MULU8/MULS16/MULU16) documented above; bail out otherwise.
33592 if (!canReduceVMulWidth(N, DAG, Mode))
33596 SDValue N0 = N->getOperand(0);
33597 SDValue N1 = N->getOperand(1);
33598 EVT VT = N->getOperand(0).getValueType();
33599 unsigned NumElts = VT.getVectorNumElements();
// An odd element count cannot be repacked via unpck lo/hi pairs below.
33600 if ((NumElts % 2) != 0)
// RegSize is the 128-bit XMM register width; OpsVT is a full <8 x i16>.
33603 unsigned RegSize = 128;
33604 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
33605 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
33607 // Shrink the operands of mul.
33608 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
33609 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Case 1: the truncated type fills at least one 128-bit register.
33611 if (NumElts >= OpsVT.getVectorNumElements()) {
33612 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
33613 // lower part is needed.
33614 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
33615 if (Mode == MULU8 || Mode == MULS8) {
// 8-bit modes: the 16-bit product is exact, so just re-extend it to VT.
33616 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
33619 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
33620 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
33621 // the higher part is also needed.
33622 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33623 ReducedVT, NewN0, NewN1);
33625 // Repack the lower part and higher part result of mul into a wider
33627 // Generate shuffle functioning as punpcklwd.
33628 SmallVector<int, 16> ShuffleMask(NumElts);
33629 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33630 ShuffleMask[2 * i] = i;
33631 ShuffleMask[2 * i + 1] = i + NumElts;
// Interleaving lo/hi i16 halves then bitcasting to i32 reconstructs the
// full 32-bit products (little-endian lane layout).
33634 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33635 ResLo = DAG.getBitcast(ResVT, ResLo);
33636 // Generate shuffle functioning as punpckhwd.
33637 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33638 ShuffleMask[2 * i] = i + NumElts / 2;
33639 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33642 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33643 ResHi = DAG.getBitcast(ResVT, ResHi);
33644 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
// Case 2: fewer elements than a full register; widen explicitly instead of
// letting type legalization do it, to avoid redundant unpack instructions.
33647 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
33648 // to legalize the mul explicitly because implicit legalization for type
33649 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
33650 // instructions which will not exist when we explicitly legalize it by
33651 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
33652 // <4 x i16> undef).
33654 // Legalize the operands of mul.
33655 // FIXME: We may be able to handle non-concatenated vectors by insertion.
33656 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
33657 if ((RegSize % ReducedSizeInBits) != 0)
// Pad with undef subvectors up to a full 128-bit register.
33660 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
33661 DAG.getUNDEF(ReducedVT));
33663 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33665 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33667 if (Mode == MULU8 || Mode == MULS8) {
33668 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
33670 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33672 // convert the type of mul result to VT.
33673 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
// *_EXTEND_VECTOR_INREG widens the low elements only, then the original
// narrow VT is extracted from element 0.
33674 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
33675 : ISD::SIGN_EXTEND_VECTOR_INREG,
33677 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33678 DAG.getIntPtrConstant(0, DL));
33680 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
33681 // MULU16/MULS16, both parts are needed.
33682 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33683 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33684 OpsVT, NewN0, NewN1);
33686 // Repack the lower part and higher part result of mul into a wider
33687 // result. Make sure the type of mul result is VT.
33688 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
// getUnpackl interleaves the low halves (punpcklwd) to form i32 products.
33689 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
33690 Res = DAG.getBitcast(ResVT, Res);
33691 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33692 DAG.getIntPtrConstant(0, DL));
// Lower a multiply by selected constants (11, 21, 41, 22, 19, 37, 73, 13,
// 23, 26, 28, 29, and pow2+2/4/8 sums) into short LEA/SHL/ADD/SUB chains.
// Returns an empty SDValue when MulAmt matches none of the special cases.
// NOTE(review): the switch statement and several `return SDValue();` /
// closing-brace lines are elided in this extraction; the `case` labels for
// each constant sit between the comment/return pairs below.
33697 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
33698 EVT VT, const SDLoc &DL) {
// Builds add/sub(shl(mul x, Mult), Shift), x) — one LEA plus one shift.
33700 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
33701 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33702 DAG.getConstant(Mult, DL, VT));
33703 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
33704 DAG.getConstant(Shift, DL, MVT::i8));
33705 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
// Builds add/sub(mul(mul x, Mul1), Mul2), x) — two LEAs.
33710 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
33711 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33712 DAG.getConstant(Mul1, DL, VT));
33713 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
33714 DAG.getConstant(Mul2, DL, VT));
33715 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33724 // mul x, 11 => add ((shl (mul x, 5), 1), x)
33725 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
33727 // mul x, 21 => add ((shl (mul x, 5), 2), x)
33728 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
33730 // mul x, 41 => add ((shl (mul x, 5), 3), x)
33731 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
33733 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
33734 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33735 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
33737 // mul x, 19 => add ((shl (mul x, 9), 1), x)
33738 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
33740 // mul x, 37 => add ((shl (mul x, 9), 2), x)
33741 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
33743 // mul x, 73 => add ((shl (mul x, 9), 3), x)
33744 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
33746 // mul x, 13 => add ((shl (mul x, 3), 2), x)
33747 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
33749 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
33750 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
33752 // mul x, 26 => add ((mul (mul x, 5), 5), x)
33753 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
33755 // mul x, 28 => add ((mul (mul x, 9), 3), x)
33756 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
33758 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
33759 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33760 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
33763 // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed
33764 // by a single LEA.
33765 // First check if this a sum of two power of 2s because that's easy. Then
33766 // count how many zeros are up to the first bit.
33767 // TODO: We can do this even without LEA at a cost of two shifts and an add.
// MulAmt & (MulAmt - 1) clears the lowest set bit; if the remainder is a
// power of two, MulAmt was a sum of exactly two powers of two.
33768 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
33769 unsigned ScaleShift = countTrailingZeros(MulAmt);
// ScaleShift in [1,3] maps onto LEA's 2/4/8 scale factors.
33770 if (ScaleShift >= 1 && ScaleShift < 4) {
33771 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
33772 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33773 DAG.getConstant(ShiftAmt, DL, MVT::i8));
33774 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33775 DAG.getConstant(ScaleShift, DL, MVT::i8));
33776 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
33783 // If the upper 17 bits of each element are zero then we can use PMADDWD,
33784 // which is always at least as quick as PMULLD, expect on KNL.
// NOTE(review): "expect" reads like a typo for "except" — KNL is explicitly
// excluded below. Returns an empty SDValue when the combine does not apply.
33785 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
33786 const X86Subtarget &Subtarget) {
33787 if (!Subtarget.hasSSE2())
// PMADDWD is slow on Knights Landing; keep PMULLD there.
33790 if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
33793 EVT VT = N->getValueType(0);
33795 // Only support vXi32 vectors.
33796 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
33799 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
33800 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
33801 if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
33804 SDValue N0 = N->getOperand(0);
33805 SDValue N1 = N->getOperand(1);
// 17 zero high bits (not 16) guarantee the signed 16x16 products and their
// pairwise sum cannot overflow, so PMADDWD matches a plain i32 multiply.
33806 APInt Mask17 = APInt::getHighBitsSet(32, 17);
33807 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
33808 !DAG.MaskedValueIsZero(N0, Mask17))
33811 // Use SplitOpsAndApply to handle AVX splitting.
33812 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33813 ArrayRef<SDValue> Ops) {
33814 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
33815 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
33817 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
33818 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
// Lower a vXi64 multiply to PMULDQ/PMULUDQ when only the low 32 bits of each
// operand are significant. Returns an empty SDValue if neither form applies.
33822 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
33823 const X86Subtarget &Subtarget) {
33824 if (!Subtarget.hasSSE2())
33827 EVT VT = N->getValueType(0);
33829 // Only support vXi64 vectors.
33830 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
33831 !DAG.getTargetLoweringInfo().isTypeLegal(VT))
33834 SDValue N0 = N->getOperand(0);
33835 SDValue N1 = N->getOperand(1);
33837 // MULDQ returns the 64-bit result of the signed multiplication of the lower
33838 // 32-bits. We can lower with this if the sign bits stretch that far.
// > 32 sign bits means each value is a sign-extension of its low 32 bits,
// so the signed 32x32->64 multiply is exact. PMULDQ requires SSE4.1.
33839 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
33840 DAG.ComputeNumSignBits(N1) > 32) {
33841 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33842 ArrayRef<SDValue> Ops) {
33843 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
// SplitOpsAndApply handles splitting wide (YMM/ZMM) vectors as needed.
33845 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33846 PMULDQBuilder, /*CheckBWI*/false);
33849 // If the upper bits are zero we can use a single pmuludq.
33850 APInt Mask = APInt::getHighBitsSet(64, 32);
33851 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
33852 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33853 ArrayRef<SDValue> Ops) {
33854 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
33856 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33857 PMULUDQBuilder, /*CheckBWI*/false);
33863 /// Optimize a single multiply with constant into two operations in order to
33864 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
// NOTE(review): elided lines in this extraction include the early returns for
// each sub-combine, `SDLoc DL(N);`, `SDValue NewMul;` and the final
// `return NewMul;` — the fused-line-number gaps mark where they were.
33865 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
33866 TargetLowering::DAGCombinerInfo &DCI,
33867 const X86Subtarget &Subtarget) {
33868 EVT VT = N->getValueType(0);
// Try the vector-specific combines first.
33870 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
33873 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
33876 if (DCI.isBeforeLegalize() && VT.isVector())
33877 return reduceVMULWidth(N, DAG, Subtarget);
// MulConstantOptimization is a cl::opt flag gating the scalar rewrites below.
33879 if (!MulConstantOptimization)
33881 // An imul is usually smaller than the alternative sequence.
33882 if (DAG.getMachineFunction().getFunction().optForMinSize())
33885 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
// Only scalar i32/i64 from here on.
33888 if (VT != MVT::i64 && VT != MVT::i32)
33891 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
// Power-of-two multiplies are already handled as shifts elsewhere.
33894 if (isPowerOf2_64(C->getZExtValue()))
33897 int64_t SignMulAmt = C->getSExtValue();
33898 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
33899 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
// 3/5/9 map directly onto a single LEA (base + scaled index).
33902 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
33903 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33904 DAG.getConstant(AbsMulAmt, DL, VT));
33905 if (SignMulAmt < 0)
// Negate via 0 - x rather than multiplying by the negative constant.
33906 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
// Try to factor |amt| as {9,5,3} * cofactor so both halves are cheap.
33912 uint64_t MulAmt1 = 0;
33913 uint64_t MulAmt2 = 0;
33914 if ((AbsMulAmt % 9) == 0) {
33916 MulAmt2 = AbsMulAmt / 9;
33917 } else if ((AbsMulAmt % 5) == 0) {
33919 MulAmt2 = AbsMulAmt / 5;
33920 } else if ((AbsMulAmt % 3) == 0) {
33922 MulAmt2 = AbsMulAmt / 3;
33926 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
33928 (isPowerOf2_64(MulAmt2) ||
33929 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
33931 if (isPowerOf2_64(MulAmt2) &&
33932 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
33933 // If second multiplifer is pow2, issue it first. We want the multiply by
33934 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
33936 std::swap(MulAmt1, MulAmt2);
// Emit the first factor: shift for a power of two, LEA-style MUL_IMM else.
33938 if (isPowerOf2_64(MulAmt1))
33939 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33940 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
33942 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33943 DAG.getConstant(MulAmt1, DL, VT));
33945 if (isPowerOf2_64(MulAmt2))
33946 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
33947 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
33949 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
33950 DAG.getConstant(MulAmt2, DL, VT));
33952 // Negate the result.
33953 if (SignMulAmt < 0)
33954 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
33956 } else if (!Subtarget.slowLEA())
33957 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
// Fallback: constants one away (or, for non-negative amounts, two away)
// from a power of two become a shift plus add/sub chains.
33960 assert(C->getZExtValue() != 0 &&
33961 C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
33962 "Both cases that could cause potential overflows should have "
33963 "already been handled.");
33964 if (isPowerOf2_64(AbsMulAmt - 1)) {
33965 // (mul x, 2^N + 1) => (add (shl x, N), x)
33966 NewMul = DAG.getNode(
33967 ISD::ADD, DL, VT, N->getOperand(0),
33968 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33969 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
33971 // To negate, subtract the number from zero
33972 if (SignMulAmt < 0)
33973 NewMul = DAG.getNode(ISD::SUB, DL, VT,
33974 DAG.getConstant(0, DL, VT), NewMul);
33975 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
33976 // (mul x, 2^N - 1) => (sub (shl x, N), x)
33977 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33978 DAG.getConstant(Log2_64(AbsMulAmt + 1),
33980 // To negate, reverse the operands of the subtract.
33981 if (SignMulAmt < 0)
33982 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
33984 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
33985 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
33986 // (mul x, 2^N + 2) => (add (add (shl x, N), x), x)
33987 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33988 DAG.getConstant(Log2_64(AbsMulAmt - 2),
33990 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
33991 NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0));
33992 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
33993 // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x)
33994 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33995 DAG.getConstant(Log2_64(AbsMulAmt + 2),
33997 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
33998 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
// DAG combines for ISD::SHL: fold shifts of SETCC_CARRY-derived masks into
// ANDs, and turn a vector shift-by-1 into an ADD. Returns empty on no match.
34005 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
34006 SDValue N0 = N->getOperand(0);
34007 SDValue N1 = N->getOperand(1);
34008 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
34009 EVT VT = N0.getValueType();
34011 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
34012 // since the result of setcc_c is all zero's or all ones.
34013 if (VT.isInteger() && !VT.isVector() &&
34014 N1C && N0.getOpcode() == ISD::AND &&
34015 N0.getOperand(1).getOpcode() == ISD::Constant) {
34016 SDValue N00 = N0.getOperand(0);
34017 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
34018 Mask <<= N1C->getAPIntValue();
34019 bool MaskOK = false;
34020 // We can handle cases concerning bit-widening nodes containing setcc_c if
34021 // we carefully interrogate the mask to make sure we are semantics
// (continues: "...preserving" — elided line.)
34023 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
34024 // of the underlying setcc_c operation if the setcc_c was zero extended.
34025 // Consider the following example:
34026 // zext(setcc_c) -> i32 0x0000FFFF
34027 // c1 -> i32 0x0000FFFF
34028 // c2 -> i32 0x00000001
34029 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
34030 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
// Direct SETCC_CARRY: always all-zeros/all-ones, any mask is fine.
34031 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
// sext(SETCC_CARRY): sign extension preserves the all-bits pattern.
34033 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
34034 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
// zext/anyext(SETCC_CARRY): only safe while the shifted mask stays within
// the original (pre-extension) bit width — see the example above.
34036 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
34037 N00.getOpcode() == ISD::ANY_EXTEND) &&
34038 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
34039 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
34041 if (MaskOK && Mask != 0) {
34043 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
34047 // Hardware support for vector shifts is sparse which makes us scalarize the
34048 // vector operations in many cases. Also, on sandybridge ADD is faster than
// (continues: "...shl" — elided line.)
34050 // (shl V, 1) -> add V,V
34051 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
34052 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
34053 assert(N0.getValueType().isVector() && "Invalid vector shift type");
34054 // We shift all of the values by one. In many cases we do not have
34055 // hardware support for this operation. This is better expressed as an ADD
// (continues: "...of two values" — elided line.)
34057 if (N1SplatC->getAPIntValue() == 1)
34058 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
// DAG combine for ISD::SRA: rewrite (ashr (shl x, C1), C2) as a
// sign-extend-in-reg plus a cheaper (possibly zero-width) shift.
34064 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
34065 SDValue N0 = N->getOperand(0);
34066 SDValue N1 = N->getOperand(1);
34067 EVT VT = N0.getValueType();
34068 unsigned Size = VT.getSizeInBits();
34070 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
34071 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
34072 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
34073 // depending on sign of (SarConst - [56,48,32,24,16])
34075 // sexts in X86 are MOVs. The MOVs have the same code size
34076 // as above SHIFTs (only SHIFT on 1 has lower code size).
34077 // However the MOVs have 2 advantages to a SHIFT:
34078 // 1. MOVs can write to a register that differs from source
34079 // 2. MOVs accept memory operands
// Scalar only, and the inner SHL must be single-use with constant amounts.
34081 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
34082 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
34083 N0.getOperand(1).getOpcode() != ISD::Constant)
34086 SDValue N00 = N0.getOperand(0);
34087 SDValue N01 = N0.getOperand(1);
34088 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
34089 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
34090 EVT CVT = N1.getValueType();
34092 if (SarConst.isNegative())
// Look for an SHL amount that exactly positions an i8/i16/i32 value at the
// top of the register (ShlConst == Size - ShiftSize).
34095 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
34096 unsigned ShiftSize = SVT.getSizeInBits();
34097 // skipping types without corresponding sext/zext and
34098 // ShlConst that is not one of [56,48,32,24,16]
34099 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
34103 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
// Residual shift after the sign-extension absorbs (Size - ShiftSize) bits.
34104 SarConst = SarConst - (Size - ShiftSize);
// (Elided branch: SarConst == 0 returns the sign-extended value directly.)
34107 else if (SarConst.isNegative())
34108 return DAG.getNode(ISD::SHL, DL, VT, NN,
34109 DAG.getConstant(-SarConst, DL, CVT));
34111 return DAG.getNode(ISD::SRA, DL, VT, NN,
34112 DAG.getConstant(SarConst, DL, CVT));
// DAG combine for ISD::SRL: reorder srl(and(X, C1), C2) to
// and(srl(X, C2), C1 >> C2) when the shifted mask fits a smaller immediate.
34117 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
34118 TargetLowering::DAGCombinerInfo &DCI) {
34119 SDValue N0 = N->getOperand(0);
34120 SDValue N1 = N->getOperand(1);
34121 EVT VT = N0.getValueType();
34123 // Only do this on the last DAG combine as it can interfere with other
// (continues: "...combines" — elided line.)
34125 if (!DCI.isAfterLegalizeDAG())
34128 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
34129 // TODO: This is a generic DAG combine that became an x86-only combine to
34130 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
34131 // and-not ('andn').
34132 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
34135 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
34136 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
34137 if (!ShiftC || !AndC)
34140 // If we can shrink the constant mask below 8-bits or 32-bits, then this
34141 // transform should reduce code size. It may also enable secondary transforms
34142 // from improved known-bits analysis or instruction selection.
34143 APInt MaskVal = AndC->getAPIntValue();
34145 // If this can be matched by a zero extend, don't optimize.
// A low mask of 8/16/32/64 ones is a movzx pattern — leave it alone.
34146 if (MaskVal.isMask()) {
34147 unsigned TO = MaskVal.countTrailingOnes();
34148 if (TO >= 8 && isPowerOf2_32(TO))
34152 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
// getMinSignedBits decides which immediate encoding (imm8/imm32) fits.
34153 unsigned OldMaskSize = MaskVal.getMinSignedBits();
34154 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
34155 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
34156 (OldMaskSize > 32 && NewMaskSize <= 32)) {
34157 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
34159 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
34160 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
34161 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
// Dispatcher for shift-node DAG combines: routes SHL/SRA/SRL to the
// dedicated helpers above. (The `return V;` lines after each inner `if`
// are elided in this extraction.)
34166 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
34167 TargetLowering::DAGCombinerInfo &DCI,
34168 const X86Subtarget &Subtarget) {
34169 if (N->getOpcode() == ISD::SHL)
34170 if (SDValue V = combineShiftLeft(N, DAG))
34173 if (N->getOpcode() == ISD::SRA)
34174 if (SDValue V = combineShiftRightArithmetic(N, DAG))
34177 if (N->getOpcode() == ISD::SRL)
34178 if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
// DAG combine for X86ISD::PACKSS/PACKUS: constant-fold packs of constant
// inputs (applying signed/unsigned saturation) and otherwise try to merge
// the pack into a surrounding shuffle chain.
34184 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
34185 TargetLowering::DAGCombinerInfo &DCI,
34186 const X86Subtarget &Subtarget) {
34187 unsigned Opcode = N->getOpcode();
// NOTE(review): assert message says "shift opcode" but this is a pack
// combine — looks like a copy/paste of the message from the shift combine.
34188 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
34189 "Unexpected shift opcode");
34191 EVT VT = N->getValueType(0);
34192 SDValue N0 = N->getOperand(0);
34193 SDValue N1 = N->getOperand(1);
34194 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
// Pack halves each element's width: two sources of 2*Dst bits per element.
34195 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
34196 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
34197 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
34198 "Unexpected PACKSS/PACKUS input type");
34200 // Constant Folding.
34201 APInt UndefElts0, UndefElts1;
34202 SmallVector<APInt, 32> EltBits0, EltBits1;
// Only fold when this node is the sole user of each non-undef operand, so
// the shared constant pool entry is not pessimized for other users.
34203 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
34204 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
34205 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
34206 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
// PACK* operates per 128-bit lane, so fold lane by lane.
34207 unsigned NumLanes = VT.getSizeInBits() / 128;
34208 unsigned NumDstElts = VT.getVectorNumElements();
34209 unsigned NumSrcElts = NumDstElts / 2;
34210 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
34211 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
34212 bool IsSigned = (X86ISD::PACKSS == Opcode);
34214 APInt Undefs(NumDstElts, 0);
34215 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
34216 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
34217 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
// First half of each dst lane comes from N0, second half from N1.
34218 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
34219 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
34220 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
34222 if (UndefElts[SrcIdx]) {
34223 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
34227 APInt &Val = EltBits[SrcIdx];
34229 // PACKSS: Truncate signed value with signed saturation.
34230 // Source values less than dst minint are saturated to minint.
34231 // Source values greater than dst maxint are saturated to maxint.
34232 if (Val.isSignedIntN(DstBitsPerElt))
34233 Val = Val.trunc(DstBitsPerElt);
34234 else if (Val.isNegative())
34235 Val = APInt::getSignedMinValue(DstBitsPerElt)
34237 Val = APInt::getSignedMaxValue(DstBitsPerElt);
34239 // PACKUS: Truncate signed value with unsigned saturation.
34240 // Source values less than zero are saturated to zero.
34241 // Source values greater than dst maxuint are saturated to maxuint.
34242 if (Val.isIntN(DstBitsPerElt))
34243 Val = Val.trunc(DstBitsPerElt);
34244 else if (Val.isNegative())
34245 Val = APInt::getNullValue(DstBitsPerElt);
34247 Val = APInt::getAllOnesValue(DstBitsPerElt);
34249 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
34253 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
34256 // Attempt to combine as shuffle.
34259 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34260 /*HasVarMask*/ false, DAG, Subtarget))
// DAG combine for immediate vector shifts (VSHLI/VSRAI/VSRLI): simplify
// out-of-range and zero shifts, fold shift-of-shift patterns, turn
// byte-aligned logical shifts into shuffles, and constant-fold.
34266 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
34267 TargetLowering::DAGCombinerInfo &DCI,
34268 const X86Subtarget &Subtarget) {
34269 unsigned Opcode = N->getOpcode();
34270 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
34271 X86ISD::VSRLI == Opcode) &&
34272 "Unexpected shift opcode");
34273 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
34274 EVT VT = N->getValueType(0);
34275 SDValue N0 = N->getOperand(0);
34276 SDValue N1 = N->getOperand(1);
34277 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
34278 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
34279 "Unexpected value type");
34281 // Out of range logical bit shifts are guaranteed to be zero.
34282 // Out of range arithmetic bit shifts splat the sign bit.
34283 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
34284 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
34286 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
// Arithmetic case: clamp to NumBitsPerElt-1, which produces the sign splat.
34288 ShiftVal = NumBitsPerElt - 1;
34291 // Shift N0 by zero -> N0.
34295 // Shift zero -> zero.
34296 if (ISD::isBuildVectorAllZeros(N0.getNode()))
34297 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34299 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
34300 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
34301 // TODO - support other sra opcodes as needed.
34302 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
34303 N0.getOpcode() == X86ISD::VSRAI)
34304 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
34306 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
34307 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
34308 N1 == N0.getOperand(1)) {
34309 SDValue N00 = N0.getOperand(0);
// Enough sign bits means shl then ashr round-trips losslessly.
34310 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
34311 if (ShiftVal.ult(NumSignBits))
34315 // We can decode 'whole byte' logical bit shifts as shuffles.
34316 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
34318 if (SDValue Res = combineX86ShufflesRecursively(
34319 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34320 /*HasVarMask*/ false, DAG, Subtarget))
34324 // Constant Folding.
34326 SmallVector<APInt, 32> EltBits;
// Only fold when this node is the operand's sole user (see pack combine).
34327 if (N->isOnlyUserOf(N0.getNode()) &&
34328 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
34329 assert(EltBits.size() == VT.getVectorNumElements() &&
34330 "Unexpected shift value type");
34331 unsigned ShiftImm = ShiftVal.getZExtValue();
34332 for (APInt &Elt : EltBits) {
34333 if (X86ISD::VSHLI == Opcode)
34335 else if (X86ISD::VSRAI == Opcode)
34336 Elt.ashrInPlace(ShiftImm);
34338 Elt.lshrInPlace(ShiftImm);
34340 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
// DAG combine for PINSRB/PINSRW: try to fold the scalar insertion into a
// surrounding shuffle chain via combineX86ShufflesRecursively.
34346 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
34347 TargetLowering::DAGCombinerInfo &DCI,
34348 const X86Subtarget &Subtarget) {
// PINSRB only exists for v16i8 and PINSRW for v8i16.
34350 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
34351 (N->getOpcode() == X86ISD::PINSRW &&
34352 N->getValueType(0) == MVT::v8i16)) &&
34353 "Unexpected vector insertion");
34355 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
34358 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34359 /*HasVarMask*/ false, DAG, Subtarget))
34365 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
34366 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
34367 /// OR -> CMPNEQSS.
34368 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
34369 TargetLowering::DAGCombinerInfo &DCI,
34370 const X86Subtarget &Subtarget) {
// (Elided here: `unsigned opcode;` / SDLoc setup, per the numbering gap.)
34373 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
34374 // we're requiring SSE2 for both.
34375 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
34376 SDValue N0 = N->getOperand(0);
34377 SDValue N1 = N->getOperand(1);
34378 SDValue CMP0 = N0->getOperand(1);
34379 SDValue CMP1 = N1->getOperand(1);
34382 // The SETCCs should both refer to the same CMP.
34383 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
34386 SDValue CMP00 = CMP0->getOperand(0);
34387 SDValue CMP01 = CMP0->getOperand(1);
34388 EVT VT = CMP00.getValueType();
// Only scalar f32/f64 compares can become CMPEQSS/CMPEQSD.
34390 if (VT == MVT::f32 || VT == MVT::f64) {
34391 bool ExpectingFlags = false;
34392 // Check for any users that want flags:
34393 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
34394 !ExpectingFlags && UI != UE; ++UI)
34395 switch (UI->getOpcode()) {
// (Elided case labels — flag consumers such as SETCC/BRCOND/CMOV mark
// ExpectingFlags; extension/copy users fall through harmlessly.)
34400 ExpectingFlags = true;
34402 case ISD::CopyToReg:
34403 case ISD::SIGN_EXTEND:
34404 case ISD::ZERO_EXTEND:
34405 case ISD::ANY_EXTEND:
34409 if (!ExpectingFlags) {
34410 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
34411 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
// Canonicalize so cc0 is the E/NE side (swap if cc1 carries it).
34413 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
34414 X86::CondCode tmp = cc0;
34419 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
34420 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
34421 // FIXME: need symbolic constants for these magic numbers.
34422 // See X86ATTInstPrinter.cpp:printSSECC().
// 0 = EQ (ordered, non-signaling), 4 = NEQ for the SSE CMPSS immediate.
34423 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
34424 if (Subtarget.hasAVX512()) {
34426 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
34427 DAG.getConstant(x86cc, DL, MVT::i8));
34428 // Need to fill with zeros to ensure the bitcast will produce zeroes
34429 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
34430 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
34431 DAG.getConstant(0, DL, MVT::v16i1),
34432 FSetCC, DAG.getIntPtrConstant(0, DL));
34433 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
34434 N->getSimpleValueType(0));
34436 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
34437 CMP00.getValueType(), CMP00, CMP01,
34438 DAG.getConstant(x86cc, DL,
34441 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
34442 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
34444 if (is64BitFP && !Subtarget.is64Bit()) {
34445 // On a 32-bit target, we cannot bitcast the 64-bit float to a
34446 // 64-bit integer, since that's not a legal type. Since
34447 // OnesOrZeroesF is all ones of all zeroes, we don't need all the
34448 // bits, but can do this little dance to extract the lowest 32 bits
34449 // and work with those going forward.
34450 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
34452 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
34453 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
34454 Vector32, DAG.getIntPtrConstant(0, DL));
// Reduce the all-ones/all-zeros FP mask to an i8 0/1 truth value.
34458 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
34459 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
34460 DAG.getConstant(1, DL, IntVT));
34461 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
34463 return OneBitOfTruth;
34471 // Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
// On a match, X receives the un-inverted operand of the XOR and Y the other
// AND operand; returns true. The NOT may appear on either AND operand.
// NOTE(review): embedded line numbers jump (e.g. 34473->34476); this listing is
// elided, so the early-return and the Y-assignment/return lines of each match
// arm are not visible here - verify against the full source.
34472 static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
// Only an AND node can be rewritten as ANDNP.
34473 if (N->getOpcode() != ISD::AND)
34476 SDValue N0 = N->getOperand(0);
34477 SDValue N1 = N->getOperand(1);
// Arm 1: (and (xor X, -1), Y) - the all-ones XOR (a NOT) is on the LHS.
34478 if (N0.getOpcode() == ISD::XOR &&
34479 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
34480 X = N0.getOperand(0);
// Arm 2: (and Y, (xor X, -1)) - the NOT is on the RHS.
34484 if (N1.getOpcode() == ISD::XOR &&
34485 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
34486 X = N1.getOperand(0);
34494 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
/// Only fires for the 128/256/512-bit i64 vector types that X86ISD::ANDNP
/// handles directly. Returns the new node, or (presumably, in an elided line)
/// SDValue() when the pattern does not match.
// NOTE(review): numbering gaps (34496->34498, 34499->34503) show elided lines
// (likely the early return and the "SDValue X, Y;" declarations) - verify
// against the full source.
34495 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
34496 assert(N->getOpcode() == ISD::AND);
34498 EVT VT = N->getValueType(0);
// Restrict to vector types where ANDNP is the natural lowering.
34499 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
34503 if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
34504 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
34509 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
34510 // register. In most cases we actually compare or select YMM-sized registers
34511 // and mixing the two types creates horrible code. This method optimizes
34512 // some of the transition sequences.
34513 // Even with AVX-512 this is still useful for removing casts around logical
34514 // operations on vXi1 mask types.
// Pattern handled: ext(logic-op(trunc(A), trunc(B)-or-constant)) is rewritten
// as the logic op on the wide type, with the extension semantics re-applied
// afterwards. Returns the widened value or (in elided lines) SDValue().
// NOTE(review): the embedded line numbers jump throughout; elided lines
// (early returns, the RHSTrunc branch header, switch/brace lines) are not
// visible in this listing - verify against the full source.
34515 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
34516 const X86Subtarget &Subtarget) {
34517 EVT VT = N->getValueType(0);
34518 assert(VT.isVector() && "Expected vector type");
34520 assert((N->getOpcode() == ISD::ANY_EXTEND ||
34521 N->getOpcode() == ISD::ZERO_EXTEND ||
34522 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
34524 SDValue Narrow = N->getOperand(0);
34525 EVT NarrowVT = Narrow.getValueType();
// Only bitwise logic ops are safe to perform at the wider width.
34527 if (Narrow->getOpcode() != ISD::XOR &&
34528 Narrow->getOpcode() != ISD::AND &&
34529 Narrow->getOpcode() != ISD::OR)
34532 SDValue N0 = Narrow->getOperand(0);
34533 SDValue N1 = Narrow->getOperand(1);
34536 // The Left side has to be a trunc.
34537 if (N0.getOpcode() != ISD::TRUNCATE)
34540 // The type of the truncated inputs.
34541 if (N0->getOperand(0).getValueType() != VT)
34544 // The right side has to be a 'trunc' or a constant vector.
34545 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
34546 N1.getOperand(0).getValueType() == VT;
34548 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
// Bail out unless the target can do this op natively (or promoted) at VT.
34551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34553 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
34556 // Set N0 and N1 to hold the inputs to the new wide operation.
34557 N0 = N0->getOperand(0);
// Constant RHS: widen it by zero-extension instead of looking through a trunc.
34559 N1 = N1->getOperand(0);
34561 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
34563 // Generate the wide operation.
34564 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
34565 unsigned Opcode = N->getOpcode();
// Re-apply the original extension kind on the widened result.
34567 default: llvm_unreachable("Unexpected opcode");
34568 case ISD::ANY_EXTEND:
34570 case ISD::ZERO_EXTEND:
34571 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
34572 case ISD::SIGN_EXTEND:
34573 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
34574 Op, DAG.getValueType(NarrowVT));
34578 /// If both input operands of a logic op are being cast from floating point
34579 /// types, try to convert this into a floating point logic node to avoid
34580 /// unnecessary moves from SSE to integer registers.
/// i.e. and/or/xor(bitcast(fpX), bitcast(fpY)) -> bitcast(FAND/FOR/FXOR(X, Y)).
/// Gated on SSE1 for i32 and SSE2 for i64 so the FP logic op is legal.
// NOTE(review): numbering gaps (e.g. 34589->34591, 34607 onward) show elided
// lines (the fall-through return paths / closing braces) - verify full source.
34581 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
34582 const X86Subtarget &Subtarget) {
// Map the integer logic opcode to its X86 FP equivalent.
34583 unsigned FPOpcode = ISD::DELETED_NODE;
34584 if (N->getOpcode() == ISD::AND)
34585 FPOpcode = X86ISD::FAND;
34586 else if (N->getOpcode() == ISD::OR)
34587 FPOpcode = X86ISD::FOR;
34588 else if (N->getOpcode() == ISD::XOR)
34589 FPOpcode = X86ISD::FXOR;
34591 assert(FPOpcode != ISD::DELETED_NODE &&
34592 "Unexpected input node for FP logic conversion");
34594 EVT VT = N->getValueType(0);
34595 SDValue N0 = N->getOperand(0);
34596 SDValue N1 = N->getOperand(1);
// Both operands must be bitcasts, and the integer width must match a legal
// scalar FP logic width for the subtarget.
34598 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
34599 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
34600 (Subtarget.hasSSE2() && VT == MVT::i64))) {
34601 SDValue N00 = N0.getOperand(0);
34602 SDValue N10 = N1.getOperand(0);
34603 EVT N00Type = N00.getValueType();
34604 EVT N10Type = N10.getValueType();
34605 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
34606 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
34607 return DAG.getBitcast(VT, FPLogic);
34613 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
34614 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
34615 /// with a shift-right to eliminate loading the vector constant mask value.
/// and(sext-like-allbits-value, low-bit-mask) -> srl(value, EltBits - MaskLen).
// NOTE(review): numbering gaps show elided early-return lines and the
// "APInt SplatVal;" declaration - verify against the full source.
34616 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
34617 const X86Subtarget &Subtarget) {
// Look through bitcasts so differently-typed masks still match.
34618 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
34619 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
34620 EVT VT0 = Op0.getValueType();
34621 EVT VT1 = Op1.getValueType();
34623 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
// The RHS must be a splat of a contiguous low-bits mask (e.g. 0x0..0FF).
34627 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
34628 !SplatVal.isMask())
// Need an immediate-form logical right shift for this element type.
34631 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
// The LHS must be all-sign-bits (every element 0 or all-ones), so a logical
// shift right of it yields exactly the masked value.
34634 unsigned EltBitWidth = VT0.getScalarSizeInBits();
34635 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
34639 unsigned ShiftVal = SplatVal.countTrailingOnes();
34640 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
34641 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
34642 return DAG.getBitcast(N->getValueType(0), Shift);
34645 // Get the index node from the lowered DAG of a GEP IR instruction with one
34646 // indexing dimension.
// Expects the address to look like add(shl(Index, log2(EltSize)), Base) as
// produced by GEP lowering; returns the Index operand of the SHL.
// NOTE(review): the early "return SDValue();" lines after each check are
// elided from this listing (numbering gaps) - verify against the full source.
34647 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
// Pre/post-indexed loads do not have the plain add(shl, base) address form.
34648 if (Ld->isIndexed())
34651 SDValue Base = Ld->getBasePtr();
34653 if (Base.getOpcode() != ISD::ADD)
34656 SDValue ShiftedIndex = Base.getOperand(0);
34658 if (ShiftedIndex.getOpcode() != ISD::SHL)
34661 return ShiftedIndex.getOperand(0);
// Returns true if the subtarget has a BZHI instruction usable for scalar
// integer type VT (32-bit always with BMI2; 64-bit only on 64-bit targets).
// NOTE(review): the closing braces / final "return false;" are elided from
// this listing. Also, "Subtarget.is64Bit() ? true : false" is redundant and
// could be just "Subtarget.is64Bit()" - left byte-identical here.
34665 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
34666 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
34667 switch (VT.getSizeInBits()) {
34668 default: return false;
34669 case 64: return Subtarget.is64Bit() ? true : false;
34670 case 32: return true;
34676 // This function recognizes cases where X86 bzhi instruction can replace and
34677 // 'and-load' sequence.
34678 // In case of loading integer value from an array of constants which is defined
34681 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
34683 // then applying a bitwise and on the result with another input.
34684 // It's equivalent to performing bzhi (zero high bits) on the input, with the
34685 // same index of the load.
// Returns the replacement DAG (an AND of input with srl(all-ones, 32 - idx),
// which instruction selection matches to BZHI), or SDValue() on no match.
// NOTE(review): this listing is elided (numbering gaps hide continue/return
// lines, null checks - e.g. the dyn_cast<ConstantInt> result is dereferenced
// at 34729 with the guarding line not visible) - verify against full source.
34686 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
34687 const X86Subtarget &Subtarget) {
34688 MVT VT = Node->getSimpleValueType(0);
34691 // Check if subtarget has BZHI instruction for the node's type
34692 if (!hasBZHI(Subtarget, VT))
34695 // Try matching the pattern for both operands.
34696 for (unsigned i = 0; i < 2; i++) {
34697 SDValue N = Node->getOperand(i);
34698 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
34700 // continue if the operand is not a load instruction
// The load must come from a constant global array of masks, reached via GEP.
34704 const Value *MemOp = Ld->getMemOperand()->getValue();
34709 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
34710 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
34711 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
34713 Constant *Init = GV->getInitializer();
34714 Type *Ty = Init->getType();
// Element width must equal VT, and the array must be short enough that
// every element (2^j - 1) fits in that width.
34715 if (!isa<ConstantDataArray>(Init) ||
34716 !Ty->getArrayElementType()->isIntegerTy() ||
34717 Ty->getArrayElementType()->getScalarSizeInBits() !=
34718 VT.getSizeInBits() ||
34719 Ty->getArrayNumElements() >
34720 Ty->getArrayElementType()->getScalarSizeInBits())
34723 // Check if the array's constant elements are suitable to our case.
34724 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
34725 bool ConstantsMatch = true;
34726 for (uint64_t j = 0; j < ArrayElementCount; j++) {
34727 ConstantInt *Elem =
34728 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
// Element j must be the low-bits mask 2^j - 1.
34729 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
34730 ConstantsMatch = false;
34734 if (!ConstantsMatch)
34737 // Do the transformation (For 32-bit type):
34738 // -> (and (load arr[idx]), inp)
34739 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
34740 // that will be replaced with one bzhi instruction.
34741 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
34742 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
34744 // Get the Node which indexes into the array.
34745 SDValue Index = getIndexFromUnindexedLoad(Ld);
34748 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
34750 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
34751 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
34753 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
34754 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
34756 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
// DAG-combine entry point for ISD::AND. Tries, in order: SSE1-only FAND
// bitcast, 32-bit narrowing when high bits are known zero, then the helper
// combines (compare-equal, FP logic, ANDNP, mask-to-shift, BZHI), and finally
// shuffle-combining of bitmask ANDs. Returns the replacement or SDValue().
// NOTE(review): this listing is elided (numbering gaps hide "return R;" lines
// after each helper call, the "SDValue()" fallthrough, "SDLoc dl"/"SDValue Op"
// declarations and "APInt UndefElts;") - verify against the full source.
34764 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
34765 TargetLowering::DAGCombinerInfo &DCI,
34766 const X86Subtarget &Subtarget) {
34767 EVT VT = N->getValueType(0);
34769 // If this is SSE1 only convert to FAND to avoid scalarization.
34770 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34771 return DAG.getBitcast(
34772 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
34773 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34774 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34777 // Use a 32-bit and+zext if upper bits known zero.
34778 if (VT == MVT::i64 && Subtarget.is64Bit() &&
34779 !isa<ConstantSDNode>(N->getOperand(1))) {
34780 APInt HiMask = APInt::getHighBitsSet(64, 32);
34781 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
34782 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
34784 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
34785 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
34786 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
34787 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
// Remaining combines assume types have been legalized.
34791 if (DCI.isBeforeLegalizeOps())
34794 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34797 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34800 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
34803 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
34806 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
34809 // Attempt to recursively combine a bitmask AND with shuffles.
34810 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34812 if (SDValue Res = combineX86ShufflesRecursively(
34813 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34814 /*HasVarMask*/ false, DAG, Subtarget))
34818 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
34819 if ((VT.getScalarSizeInBits() % 8) == 0 &&
34820 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34821 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
34822 SDValue BitMask = N->getOperand(1);
34823 SDValue SrcVec = N->getOperand(0).getOperand(0);
34824 EVT SrcVecVT = SrcVec.getValueType();
34826 // Check that the constant bitmask masks whole bytes.
34828 SmallVector<APInt, 64> EltBits;
34829 if (VT == SrcVecVT.getScalarType() &&
34830 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
34831 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
34832 llvm::all_of(EltBits, [](APInt M) {
// Each mask byte must be all-zeros or all-ones so it can be expressed as a
// zero/identity entry in a shuffle mask.
34833 return M.isNullValue() || M.isAllOnesValue();
34835 unsigned NumElts = SrcVecVT.getVectorNumElements();
34836 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
34837 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
34839 // Create a root shuffle mask from the byte mask and the extracted index.
34840 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
34841 for (unsigned i = 0; i != Scale; ++i) {
34844 int VecIdx = Scale * Idx + i;
34845 ShuffleMask[VecIdx] =
34846 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
34849 if (SDValue Shuffle = combineX86ShufflesRecursively(
34850 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
34851 /*HasVarMask*/ false, DAG, Subtarget))
34852 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
34853 N->getOperand(0).getOperand(1));
34860 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
// On success, Mask/X/Y receive the blend operands (select Mask ? X : Y per
// the caller's convention) and the function returns true.
// NOTE(review): listing is elided - "return false;" lines, the std::swap for
// canonicalization, and the final "return true;" are not visible (numbering
// gaps, e.g. 34869->34872, 34883->34887) - verify against the full source.
34861 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
34862 if (N->getOpcode() != ISD::OR)
34865 SDValue N0 = N->getOperand(0);
34866 SDValue N1 = N->getOperand(1);
34868 // Canonicalize AND to LHS.
34869 if (N1.getOpcode() == ISD::AND)
34872 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
34873 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
// ANDNP(M, X) computes ~M & X, so X comes from the ANDNP side.
34876 Mask = N1.getOperand(0);
34877 X = N1.getOperand(1);
34879 // Check to see if the mask appeared in both the AND and ANDNP.
34880 if (N0.getOperand(0) == Mask)
34881 Y = N0.getOperand(1);
34882 else if (N0.getOperand(1) == Mask)
34883 Y = N0.getOperand(0);
34887 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
34888 // ANDNP combine allows other combines to happen that prevent matching.
34893 // (or (and (m, y), (pandn m, x)))
34895 // (vselect m, x, y)
34896 // As a special case, try to fold:
34897 // (or (and (m, (sub 0, x)), (pandn m, x)))
34899 // (sub (xor X, M), M)
// Requires every lane of the mask M to be all-zeros or all-ones (checked via
// ComputeNumSignBits). Falls back to a PBLENDVB select on SSE4.1+.
// NOTE(review): listing is elided - "return SDValue();" lines, the SDLoc DL
// declaration, and the "SDValue V;" selection in the conditional-negate path
// are not visible (numbering gaps) - verify against the full source.
34900 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
34901 const X86Subtarget &Subtarget) {
34902 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
34904 EVT VT = N->getValueType(0);
// Only 128-bit (SSE2) / 256-bit (AVX2 int) vectors are handled.
34905 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
34906 (VT.is256BitVector() && Subtarget.hasInt256())))
34909 SDValue X, Y, Mask;
34910 if (!matchLogicBlend(N, X, Y, Mask))
34913 // Validate that X, Y, and Mask are bitcasts, and see through them.
34914 Mask = peekThroughBitcasts(Mask);
34915 X = peekThroughBitcasts(X);
34916 Y = peekThroughBitcasts(Y);
34918 EVT MaskVT = Mask.getValueType();
34919 unsigned EltBits = MaskVT.getScalarSizeInBits();
34921 // TODO: Attempt to handle floating point cases as well?
// All-sign-bits check: each mask element must be 0 or all-ones for a blend.
34922 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
34928 // (or (and (M, (sub 0, X)), (pandn M, X)))
34929 // which is a special case of vselect:
34930 // (vselect M, (sub 0, X), X)
34932 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
34933 // We know that, if fNegate is 0 or 1:
34934 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
34936 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
34937 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
34938 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
34939 // This lets us transform our vselect to:
34940 // (add (xor X, M), (and M, 1))
34942 // (sub (xor X, M), M)
34943 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
34944 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
// Recognize (sub 0, V): an all-zeros LHS subtracting V.
34945 auto IsNegV = [](SDNode *N, SDValue V) {
34946 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
34947 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
34950 if (IsNegV(Y.getNode(), X))
34952 else if (IsNegV(X.getNode(), Y))
34956 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
34957 SDValue SubOp2 = Mask;
34959 // If the negate was on the false side of the select, then
34960 // the operands of the SUB need to be swapped. PR 27251.
34961 // This is because the pattern being matched above is
34962 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
34963 // but if the pattern matched was
34964 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
34965 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
34966 // pattern also needs to be a negation of the replacement pattern above.
34967 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
34968 // sub accomplishes the negation of the replacement pattern.
34970 std::swap(SubOp1, SubOp2);
34972 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
34973 return DAG.getBitcast(VT, Res);
34977 // PBLENDVB is only available on SSE 4.1.
34978 if (!Subtarget.hasSSE41())
// Perform the blend on byte elements; PBLENDVB selects by the sign bit of
// each byte, which matches an all-sign-bits mask.
34981 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
34983 X = DAG.getBitcast(BlendVT, X);
34984 Y = DAG.getBitcast(BlendVT, Y);
34985 Mask = DAG.getBitcast(BlendVT, Mask);
34986 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
34987 return DAG.getBitcast(VT, Mask);
34990 // Helper function for combineOrCmpEqZeroToCtlzSrl
34994 // srl(ctlz x), log2(bitsize(x))
34995 // Input pattern is checked by caller.
// Op is a SETCC(eq, CMP(x, 0)); produces srl(ctlz(x), log2(bits(x))) - which
// is 1 iff x == 0 - zero-extended/truncated to ExtTy.
// NOTE(review): the "SDLoc dl" declaration is elided from this listing
// (numbering gap 35000->35002) - verify against the full source.
34996 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
34997 SelectionDAG &DAG) {
34998 SDValue Cmp = Op.getOperand(1);
34999 EVT VT = Cmp.getOperand(0).getValueType();
35000 unsigned Log2b = Log2_32(VT.getSizeInBits());
35002 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
35003 // The result of the shift is true or false, and on X86, the 32-bit
35004 // encoding of shr and lzcnt is more desirable.
35005 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
35006 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
35007 DAG.getConstant(Log2b, dl, MVT::i8));
35008 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
35011 // Try to transform:
35012 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
35014 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
35015 // Will also attempt to match more generic cases, eg:
35016 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
35017 // Only applies if the target supports the FastLZCNT feature.
// NOTE(review): listing is elided - several "return SDValue();" lines and the
// failure checks after lowering attempts are not visible (numbering gaps,
// e.g. 35072->35077, 35086->35089) - verify against the full source.
35018 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
35019 TargetLowering::DAGCombinerInfo &DCI,
35020 const X86Subtarget &Subtarget) {
35021 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
// A candidate OR must be single-use so the whole chain can be rewritten.
35024 auto isORCandidate = [](SDValue N) {
35025 return (N->getOpcode() == ISD::OR && N->hasOneUse());
35028 // Check the zero extend is extending to 32-bit or more. The code generated by
35029 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
35030 // instructions to clear the upper bits.
35031 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
35032 !isORCandidate(N->getOperand(0)))
35035 // Check the node matches: setcc(eq, cmp 0)
35036 auto isSetCCCandidate = [](SDValue N) {
35037 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
35038 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
35039 N->getOperand(1).getOpcode() == X86ISD::CMP &&
35040 isNullConstant(N->getOperand(1).getOperand(1)) &&
35041 N->getOperand(1).getValueType().bitsGE(MVT::i32);
35044 SDNode *OR = N->getOperand(0).getNode();
35045 SDValue LHS = OR->getOperand(0);
35046 SDValue RHS = OR->getOperand(1);
35048 // Save nodes matching or(or, setcc(eq, cmp 0)).
// Walk down a left- or right-leaning chain of ORs, recording each level so
// it can be rebuilt bottom-up after the leaves are lowered.
35049 SmallVector<SDNode *, 2> ORNodes;
35050 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
35051 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
35052 ORNodes.push_back(OR);
35053 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
35054 LHS = OR->getOperand(0);
35055 RHS = OR->getOperand(1);
35058 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
35059 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
35060 !isORCandidate(SDValue(OR, 0)))
35063 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
35065 // or(srl(ctlz),srl(ctlz)).
35066 // The dag combiner can then fold it into:
35067 // srl(or(ctlz, ctlz)).
35068 EVT VT = OR->getValueType(0);
35069 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
35070 SDValue Ret, NewRHS;
35071 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
35072 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
35077 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
// Rebuild the saved OR chain from innermost to outermost.
35078 while (ORNodes.size() > 0) {
35079 OR = ORNodes.pop_back_val();
35080 LHS = OR->getOperand(0);
35081 RHS = OR->getOperand(1);
35082 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
35083 if (RHS->getOpcode() == ISD::OR)
35084 std::swap(LHS, RHS);
35085 EVT VT = OR->getValueType(0);
35086 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
35089 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
// Extend the final i32 result to the original (zext) destination type.
35093 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
// DAG-combine entry point for ISD::OR. Tries: SSE1-only FOR bitcast, the
// shared helper combines, PBLENDV blend matching, and finally SHLD/SHRD
// (double-shift) pattern matching for scalar i16/i32/i64.
// NOTE(review): this listing is elided - "return R;" / "return SDValue();"
// lines after helper calls, the "SDLoc DL(N);" declaration, and some closing
// braces are not visible (numbering gaps) - verify against the full source.
35098 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
35099 TargetLowering::DAGCombinerInfo &DCI,
35100 const X86Subtarget &Subtarget) {
35101 SDValue N0 = N->getOperand(0);
35102 SDValue N1 = N->getOperand(1);
35103 EVT VT = N->getValueType(0);
35105 // If this is SSE1 only convert to FOR to avoid scalarization.
35106 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
35107 return DAG.getBitcast(MVT::v4i32,
35108 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
35109 DAG.getBitcast(MVT::v4f32, N0),
35110 DAG.getBitcast(MVT::v4f32, N1)));
35113 if (DCI.isBeforeLegalizeOps())
35116 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
35119 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35122 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
// The double-shift (SHLD/SHRD) patterns below only apply to scalar ints.
35125 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
35128 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
35129 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
35131 // SHLD/SHRD instructions have lower register pressure, but on some
35132 // platforms they have higher latency than the equivalent
35133 // series of shifts/or that would otherwise be generated.
35134 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
35135 // have higher latencies and we are not optimizing for size.
35136 if (!OptForSize && Subtarget.isSHLDSlow())
// Canonicalize so N0 is the SHL and N1 the SRL.
35139 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
35141 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
35143 if (!N0.hasOneUse() || !N1.hasOneUse())
35146 SDValue ShAmt0 = N0.getOperand(1);
35147 if (ShAmt0.getValueType() != MVT::i8)
35149 SDValue ShAmt1 = N1.getOperand(1);
35150 if (ShAmt1.getValueType() != MVT::i8)
// Peek through truncates of the shift amounts.
35152 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
35153 ShAmt0 = ShAmt0.getOperand(0);
35154 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
35155 ShAmt1 = ShAmt1.getOperand(0);
// If the SHL amount is the derived one (SUB/XOR form), the pattern is
// really an SHRD; swap roles accordingly.
35158 unsigned Opc = X86ISD::SHLD;
35159 SDValue Op0 = N0.getOperand(0);
35160 SDValue Op1 = N1.getOperand(0);
35161 if (ShAmt0.getOpcode() == ISD::SUB ||
35162 ShAmt0.getOpcode() == ISD::XOR) {
35163 Opc = X86ISD::SHRD;
35164 std::swap(Op0, Op1);
35165 std::swap(ShAmt0, ShAmt1);
35168 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
35169 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
35170 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
35171 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
35172 unsigned Bits = VT.getSizeInBits();
// Case A: second amount is (Bits - ShAmt0).
35173 if (ShAmt1.getOpcode() == ISD::SUB) {
35174 SDValue Sum = ShAmt1.getOperand(0);
35175 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
35176 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
35177 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
35178 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
35179 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
35180 return DAG.getNode(Opc, DL, VT,
35182 DAG.getNode(ISD::TRUNCATE, DL,
// Case B: both amounts are constants summing to Bits.
35185 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
35186 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
35187 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
35188 return DAG.getNode(Opc, DL, VT,
35189 N0.getOperand(0), N1.getOperand(0),
35190 DAG.getNode(ISD::TRUNCATE, DL,
// Case C: amount is XOR(C, Bits-1) with a pre-shift by 1 (mask-and-shift
// idiom equivalent to shifting by Bits - C).
35192 } else if (ShAmt1.getOpcode() == ISD::XOR) {
35193 SDValue Mask = ShAmt1.getOperand(1);
35194 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
35195 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
35196 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
35197 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
35198 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
35199 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
35200 if (Op1.getOpcode() == InnerShift &&
35201 isa<ConstantSDNode>(Op1.getOperand(1)) &&
35202 Op1.getConstantOperandVal(1) == 1) {
35203 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35204 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35206 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
35207 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
35208 Op1.getOperand(0) == Op1.getOperand(1)) {
35209 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35210 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35219 /// Try to turn tests against the signbit in the form of:
35220 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into a SETGT comparison against -1, i.e. a sign-bit test the backend can
/// lower to a single compare+setcc instead of shift+truncate+xor.
// NOTE(review): listing is elided - "return SDValue();" lines, the SDLoc DL
// declaration, and the final "return Cond;" are not visible (numbering
// gaps) - verify against the full source.
35223 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
35224 // This is only worth doing if the output type is i8 or i1.
35225 EVT ResultType = N->getValueType(0);
35226 if (ResultType != MVT::i8 && ResultType != MVT::i1)
35229 SDValue N0 = N->getOperand(0);
35230 SDValue N1 = N->getOperand(1);
35232 // We should be performing an xor against a truncated shift.
35233 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
35236 // Make sure we are performing an xor against one.
35237 if (!isOneConstant(N1))
35240 // SetCC on x86 zero extends so only act on this if it's a logical shift.
35241 SDValue Shift = N0.getOperand(0);
35242 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
35245 // Make sure we are truncating from one of i16, i32 or i64.
35246 EVT ShiftTy = Shift.getValueType();
35247 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
35250 // Make sure the shift amount extracts the sign bit.
35251 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
35252 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
35255 // Create a greater-than comparison against -1.
35256 // N.B. Using SETGE against 0 works but we want a canonical looking
35257 // comparison, using SETGT matches up with what TranslateX86CC.
35259 SDValue ShiftOp = Shift.getOperand(0);
35260 EVT ShiftOpTy = ShiftOp.getValueType();
35261 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35262 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
35263 *DAG.getContext(), ResultType);
35264 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
35265 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
// Widen the setcc result to the requested type if they differ.
35266 if (SetCCResultType != ResultType)
35267 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
35271 /// Turn vector tests of the signbit in the form of:
35272 /// xor (sra X, elt_size(X)-1), -1
35276 /// This should be called before type legalization because the pattern may not
35277 /// persist after that.
/// Folds to pcmpgt(X, -1): NOT of an all-sign-bits smear is exactly
/// "X > -1" per lane.
// NOTE(review): listing is elided - the "return SDValue();" after the
// ShiftBV null-check and some switch cases are not visible (numbering
// gaps, e.g. 35289->35293) - verify against the full source.
35278 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
35279 const X86Subtarget &Subtarget) {
35280 EVT VT = N->getValueType(0);
35281 if (!VT.isSimple())
// Only types with a native PCMPGT for the subtarget are handled.
35284 switch (VT.getSimpleVT().SimpleTy) {
35285 default: return SDValue();
35288 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
35289 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
35293 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
35296 // There must be a shift right algebraic before the xor, and the xor must be a
35297 // 'not' operation.
35298 SDValue Shift = N->getOperand(0);
35299 SDValue Ones = N->getOperand(1);
35300 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
35301 !ISD::isBuildVectorAllOnes(Ones.getNode()))
35304 // The shift should be smearing the sign bit across each vector element.
35305 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
35309 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
35310 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
35311 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
35314 // Create a greater-than comparison against -1. We don't use the more obvious
35315 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
35316 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
35319 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
35320 /// is valid for the given \p Subtarget.
/// Requires AVX-512, vector types, and a destination element of i8/i16/i32.
/// 512-bit (or VLX-enabled) sources are OK for >=i32 elements, or any element
/// width with BWI. The non-VLX/non-512-bit fallthrough is elided here.
// NOTE(review): "return false;" lines and the tail of the function are not
// visible in this elided listing - verify against the full source.
35321 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
35322 const X86Subtarget &Subtarget) {
35323 if (!Subtarget.hasAVX512())
35326 // FIXME: Scalar type may be supported if we move it to vector register.
35327 if (!SrcVT.isVector())
35330 EVT SrcElVT = SrcVT.getScalarType();
35331 EVT DstElVT = DstVT.getScalarType();
35332 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
35334 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
35335 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
35339 /// Detect patterns of truncation with unsigned saturation:
35341 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
35342 /// Return the source value x to be truncated or SDValue() if the pattern was
35345 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
35346 /// where C1 >= 0 and C2 is unsigned max of destination type.
35348 /// (truncate (smax (smin (x, C2), C1)) to dest_type)
35349 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
35351 /// These two patterns are equivalent to:
35352 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
35353 /// So return the smax(x, C1) value to be truncated or SDValue() if the
35354 /// pattern was not matched.
// NOTE(review): listing is elided - the "APInt C1, C2;" declarations, the
// return values of the UMin/SMin match arms, and the final "return
// SDValue();" are not visible (numbering gaps) - verify against full source.
35355 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35357 EVT InVT = In.getValueType();
35359 // Saturation with truncation. We truncate from InVT to VT.
35360 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
35361 "Unexpected types for truncate operation");
35363 // Match min/max and return limit value as a parameter.
// Returns the non-constant operand when V is Opcode with a splat-constant
// RHS (captured into Limit); otherwise falls through (elided return).
35364 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
35365 if (V.getOpcode() == Opcode &&
35366 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
35367 return V.getOperand(0);
// Pattern 1: umin(x, UMAX_of_dest).
35372 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
35373 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
35374 // the element size of the destination type.
35375 if (C2.isMask(VT.getScalarSizeInBits()))
// Pattern 2: smin(smax(x, C1), C2), C1 >= 0, C2 == UMAX_of_dest.
35378 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
35379 if (MatchMinMax(SMin, ISD::SMAX, C1))
35380 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
// Pattern 2 (swapped): smax(smin(x, C2), C1); rebuild smax(smin, C1) so the
// returned value has the clamp applied in the canonical order.
35383 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
35384 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
35385 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
35387 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
35393 /// Detect patterns of truncation with signed saturation:
35394 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
35395 /// signed_max_of_dest_type)) to dest_type)
35397 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
35398 /// signed_min_of_dest_type)) to dest_type).
35399 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
35400 /// Return the source value to be truncated or SDValue() if the pattern was not
// NOTE(review): listing is elided - the lambda's fall-through return, the
// "if (MatchPackUS)"/"else" lines around the limit setup, the match-arm
// return values, and the final "return SDValue();" are not visible
// (numbering gaps) - verify against the full source.
35402 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
35403 unsigned NumDstBits = VT.getScalarSizeInBits();
35404 unsigned NumSrcBits = In.getScalarValueSizeInBits();
35405 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
// Returns the non-constant operand when V is Opcode with splat constant
// exactly equal to Limit.
35407 auto MatchMinMax = [](SDValue V, unsigned Opcode,
35408 const APInt &Limit) -> SDValue {
35410 if (V.getOpcode() == Opcode &&
35411 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
35412 return V.getOperand(0);
// Compute the clamp bounds in the source width: [0, UMAX] for PACKUS-style
// matching, [SMIN, SMAX] of the destination type otherwise.
35416 APInt SignedMax, SignedMin;
35418 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
35419 SignedMin = APInt(NumSrcBits, 0);
35421 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
35422 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
// Accept the clamp in either nesting order: smin(smax(..)) or smax(smin(..)).
35425 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
35426 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
35429 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
35430 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
35436 /// Detect a pattern of truncation with signed saturation.
35437 /// The types should allow to use VPMOVSS* instruction on AVX512.
35438 /// Return the source value to be truncated or SDValue() if the pattern was not
// NOTE(review): some lines elided in this excerpt (doc tail, early returns).
// Thin wrapper: gate detectSSatPattern on the input type being legal and the
// (InVT -> VT) pair being supported by the AVX-512 saturating-truncate family.
35440 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
35441 const X86Subtarget &Subtarget,
35442 const TargetLowering &TLI) {
35443 if (!TLI.isTypeLegal(In.getValueType()))
35445 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
35447 return detectSSatPattern(In, VT);
35450 /// Detect a pattern of truncation with saturation:
35451 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
35452 /// The types should allow to use VPMOVUS* instruction on AVX512.
35453 /// Return the source value to be truncated or SDValue() if the pattern was not
// NOTE(review): some lines elided in this excerpt (doc tail, early returns).
// Thin wrapper: same legality/subtarget gating as the signed variant, then
// defers the actual pattern match to detectUSatPattern.
35455 static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35457 const X86Subtarget &Subtarget,
35458 const TargetLowering &TLI) {
35459 if (!TLI.isTypeLegal(In.getValueType()))
35461 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
35463 return detectUSatPattern(In, VT, DAG, DL);
// Combine a saturating clamp + truncate into a single saturating-truncate
// node: AVX-512 VTRUNCS/VTRUNCUS when available, otherwise SSE/AVX
// PACKSS/PACKUS-based truncation for i16/i32 -> i8/i16 vectors.
// NOTE(review): non-contiguous original numbering — DL/DAG parameter lines,
// closing braces and fall-through returns are elided in this excerpt.
35466 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
35468 const X86Subtarget &Subtarget) {
35469 EVT SVT = VT.getScalarType();
35470 EVT InVT = In.getValueType();
35471 EVT InSVT = InVT.getScalarType();
35472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Preferred path: both types legal and the pair supported by AVX-512
// saturating truncates — emit a single VTRUNCS/VTRUNCUS.
35473 if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
35474 isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
35475 if (auto SSatVal = detectSSatPattern(In, VT))
35476 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
35477 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
35478 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
// Fallback path: use PACK instructions for pow-2 element counts with
// i16/i32 sources truncating to i8/i16.
35480 if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
35481 (SVT == MVT::i8 || SVT == MVT::i16) &&
35482 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
// MatchPackUS=true: the [0, unsigned-max] clamp maps onto PACKUS semantics.
35483 if (auto USatVal = detectSSatPattern(In, VT, true)) {
35484 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
35485 if (SVT == MVT::i8 && InSVT == MVT::i32) {
35486 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35487 VT.getVectorNumElements());
35488 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
35491 return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
// PACKUSDW (i32 -> i16 unsigned pack) requires SSE4.1; byte packing does not.
35493 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
35494 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
35497 if (auto SSatVal = detectSSatPattern(In, VT))
35498 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
35504 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
35505 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
35506 /// X86ISD::AVG instruction.
// NOTE(review): non-contiguous original numbering — parameter/return lines,
// loop braces and early `return SDValue()`s are elided in this excerpt.
35507 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35508 const X86Subtarget &Subtarget,
// Destination must be a pow-2-sized vector of i8 or i16 elements.
35510 if (!VT.isVector())
35512 EVT InVT = In.getValueType();
35513 unsigned NumElems = VT.getVectorNumElements();
35515 EVT ScalarVT = VT.getVectorElementType();
35516 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
35517 isPowerOf2_32(NumElems)))
35520 // InScalarVT is the intermediate type in AVG pattern and it should be greater
35521 // than the original input type (i8/i16).
35522 EVT InScalarVT = InVT.getVectorElementType();
35523 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
// PAVGB/PAVGW are SSE2 instructions.
35526 if (!Subtarget.hasSSE2())
35529 // Detect the following pattern:
35531 // %1 = zext <N x i8> %a to <N x i32>
35532 // %2 = zext <N x i8> %b to <N x i32>
35533 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
35534 // %4 = add nuw nsw <N x i32> %3, %2
35535 // %5 = lshr <N x i32> %N, <i32 1 x N>
35536 // %6 = trunc <N x i32> %5 to <N x i8>
35538 // In AVX512, the last instruction can also be a trunc store.
35539 if (In.getOpcode() != ISD::SRL)
35542 // A lambda checking the given SDValue is a constant vector and each element
35543 // is in the range [Min, Max].
35544 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
35545 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
35546 if (!BV || !BV->isConstant())
35548 for (SDValue Op : V->ops()) {
35549 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
35552 const APInt &Val = C->getAPIntValue();
35553 if (Val.ult(Min) || Val.ugt(Max))
35559 // Check if each element of the vector is left-shifted by one.
// (The SRL amount must be the splat constant 1 — divide-by-two of the sum.)
35560 auto LHS = In.getOperand(0);
35561 auto RHS = In.getOperand(1);
35562 if (!IsConstVectorInRange(RHS, 1, 1))
35564 if (LHS.getOpcode() != ISD::ADD)
35567 // Detect a pattern of a + b + 1 where the order doesn't matter.
35568 SDValue Operands[3];
35569 Operands[0] = LHS.getOperand(0);
35570 Operands[1] = LHS.getOperand(1);
// Builder callback handed to SplitOpsAndApply: emits one X86ISD::AVG per
// legal-width slice of the operands.
35572 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35573 ArrayRef<SDValue> Ops) {
35574 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
35577 // Take care of the case when one of the operands is a constant vector whose
35578 // element is in the range [1, 256].
35579 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
35580 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
35581 Operands[0].getOperand(0).getValueType() == VT) {
35582 // The pattern is detected. Subtract one from the constant vector, then
35583 // demote it and emit X86ISD::AVG instruction.
35584 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
35585 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
35586 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
35587 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35588 { Operands[0].getOperand(0), Operands[1] },
// General case: one of Operands[0]/Operands[1] is itself an ADD; unpack it so
// Operands[0..2] hold the three addends of a + b + 1.
35592 if (Operands[0].getOpcode() == ISD::ADD)
35593 std::swap(Operands[0], Operands[1]);
35594 else if (Operands[1].getOpcode() != ISD::ADD)
35596 Operands[2] = Operands[1].getOperand(0);
35597 Operands[1] = Operands[1].getOperand(1);
35599 // Now we have three operands of two additions. Check that one of them is a
35600 // constant vector with ones, and the other two are promoted from i8/i16.
35601 for (int i = 0; i < 3; ++i) {
35602 if (!IsConstVectorInRange(Operands[i], 1, 1))
// Move the all-ones addend into slot 2 so slots 0/1 hold the zexts.
35604 std::swap(Operands[i], Operands[2]);
35606 // Check if Operands[0] and Operands[1] are results of type promotion.
35607 for (int j = 0; j < 2; ++j)
35608 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
35609 Operands[j].getOperand(0).getValueType() != VT)
35612 // The pattern is detected, emit X86ISD::AVG instruction(s).
35613 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35614 { Operands[0].getOperand(0),
35615 Operands[1].getOperand(0) }, AVGBuilder);
// DAG combine for ISD::LOAD: split slow/non-temporal 256-bit loads into two
// 128-bit halves joined by CONCAT_VECTORS + TokenFactor.
// NOTE(review): non-contiguous original numbering — some lines (e.g. the
// `bool Fast` declaration, SDLoc `dl`, half-vector element count, and the
// trailing return) are elided in this excerpt.
35621 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
35622 TargetLowering::DAGCombinerInfo &DCI,
35623 const X86Subtarget &Subtarget) {
35624 LoadSDNode *Ld = cast<LoadSDNode>(N);
35625 EVT RegVT = Ld->getValueType(0);
35626 EVT MemVT = Ld->getMemoryVT();
35628 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35630 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
35631 // into two 16-byte operations. Also split non-temporal aligned loads on
35632 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
35633 ISD::LoadExtType Ext = Ld->getExtensionType();
35635 unsigned AddressSpace = Ld->getAddressSpace();
35636 unsigned Alignment = Ld->getAlignment();
// Only plain (non-extending) 256-bit loads after legalize-ops; split when the
// load is non-temporal pre-AVX2, or when the target reports the access as
// allowed-but-slow (!Fast).
35637 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
35638 Ext == ISD::NON_EXTLOAD &&
35639 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
35640 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
35641 AddressSpace, Alignment, &Fast) && !Fast))) {
35642 unsigned NumElems = RegVT.getVectorNumElements();
35646 SDValue Ptr = Ld->getBasePtr();
35648 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
// First half at the original address; second half 16 bytes further on, with
// alignment conservatively reduced via MinAlign.
35651 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
35652 Alignment, Ld->getMemOperand()->getFlags());
35654 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
35656 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
35657 Ld->getPointerInfo().getWithOffset(16),
35658 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
// Merge the two load chains so memory ordering is preserved for users of the
// original load's chain result.
35659 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
35661 Load2.getValue(1));
35663 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
35664 return DCI.CombineTo(N, NewVec, TF, true);
35670 /// If V is a build vector of boolean constants and exactly one of those
35671 /// constants is true, return the operand index of that true element.
35672 /// Otherwise, return -1.
// NOTE(review): non-contiguous original numbering — the undef-element check,
// the "too many trues" return, the TrueIndex assignment and the final return
// are elided in this excerpt.
35673 static int getOneTrueElt(SDValue V) {
35674 // This needs to be a build vector of booleans.
35675 // TODO: Checking for the i1 type matches the IR definition for the mask,
35676 // but the mask check could be loosened to i8 or other types. That might
35677 // also require checking more than 'allOnesValue'; eg, the x86 HW
35678 // instructions only require that the MSB is set for each mask element.
35679 // The ISD::MSTORE comments/definition do not specify how the mask operand
35681 auto *BV = dyn_cast<BuildVectorSDNode>(V);
35682 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
35685 int TrueIndex = -1;
35686 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
// Scan all elements; a second all-ones element disqualifies the vector.
35687 for (unsigned i = 0; i < NumElts; ++i) {
35688 const SDValue &Op = BV->getOperand(i);
35691 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
35694 if (ConstNode->getAPIntValue().isAllOnesValue()) {
35695 // If we already found a one, this is too many.
35696 if (TrueIndex >= 0)
35704 /// Given a masked memory load/store operation, return true if it has one mask
35705 /// bit set. If it has one mask bit set, then also return the memory address of
35706 /// the scalar element to load/store, the vector index to insert/extract that
35707 /// scalar element, and the alignment for the scalar memory access.
// NOTE(review): non-contiguous original numbering — the `return false;` /
// `return true;` lines and a closing brace are elided in this excerpt.
35708 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
35709 SelectionDAG &DAG, SDValue &Addr,
35710 SDValue &Index, unsigned &Alignment) {
35711 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
35712 if (TrueMaskElt < 0)
35715 // Get the address of the one scalar element that is specified by the mask
35716 // using the appropriate offset from the base pointer.
35717 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
35718 Addr = MaskedOp->getBasePtr();
35719 if (TrueMaskElt != 0) {
35720 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
35721 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
35724 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
// The scalar access can't claim more alignment than the element offset allows.
35725 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
35729 /// If exactly one element of the mask is set for a non-extending masked load,
35730 /// it is a scalar load and vector insert.
35731 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35732 /// mask have already been optimized in IR, so we don't bother with those here.
// NOTE(review): non-contiguous original numbering — the `static SDValue` return
// type line, the SDLoc `DL`, and the `SDValue Load =` prefix are elided here.
35734 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35735 TargetLowering::DAGCombinerInfo &DCI) {
35736 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35737 // However, some target hooks may need to be added to know when the transform
35738 // is profitable. Endianness would also have to be considered.
35740 SDValue Addr, VecIndex;
35741 unsigned Alignment;
// Bail out unless the mask has exactly one true bit; on success Addr/VecIndex/
// Alignment describe the single live element.
35742 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
35745 // Load the one scalar element that is specified by the mask using the
35746 // appropriate offset from the base pointer.
35748 EVT VT = ML->getValueType(0);
35749 EVT EltVT = VT.getVectorElementType();
35751 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
35752 Alignment, ML->getMemOperand()->getFlags());
35754 // Insert the loaded element into the appropriate place in the vector.
// The pass-through (Src0) supplies every lane the mask left unloaded.
35755 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
35757 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
// For a masked load whose mask is a constant build-vector: either widen it to
// a full vector load + select (when the first and last lanes are live), or
// rewrite it as a masked load with undef pass-through + select, which lowers
// to a cheaper blend.
// NOTE(review): non-contiguous original numbering — the `static SDValue` line,
// SDLoc `DL`, and some early returns are elided in this excerpt.
35761 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35762 TargetLowering::DAGCombinerInfo &DCI) {
35763 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
35767 EVT VT = ML->getValueType(0);
35769 // If we are loading the first and last elements of a vector, it is safe and
35770 // always faster to load the whole vector. Replace the masked load with a
35771 // vector load and select.
35772 unsigned NumElts = VT.getVectorNumElements();
35773 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
35774 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
35775 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
35776 if (LoadFirstElt && LoadLastElt) {
35777 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35778 ML->getMemOperand());
35779 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
35780 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
35783 // Convert a masked load with a constant mask into a masked load and a select.
35784 // This allows the select operation to use a faster kind of select instruction
35785 // (for example, vblendvps -> vblendps).
35787 // Don't try this if the pass-through operand is already undefined. That would
35788 // cause an infinite loop because that's what we're about to create.
35789 if (ML->getSrc0().isUndef())
35792 // The new masked load has an undef pass-through operand. The select uses the
35793 // original pass-through operand.
35794 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35795 ML->getMask(), DAG.getUNDEF(VT),
35796 ML->getMemoryVT(), ML->getMemOperand(),
35797 ML->getExtensionType());
35798 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
35800 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
// DAG combine for ISD::MLOAD: scalarize one-element masks, blend-ify constant
// masks (pre-AVX512), and lower sign-extending masked loads by loading in the
// memory element type into a wide vector and sign-extending afterwards.
// NOTE(review): non-contiguous original numbering — SDLoc `dl`, several early
// returns, closing braces and shuffle-mask arguments are elided here.
35803 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
35804 TargetLowering::DAGCombinerInfo &DCI,
35805 const X86Subtarget &Subtarget) {
35806 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
35808 // TODO: Expanding load with constant mask may be optimized as well.
35809 if (Mld->isExpandingLoad())
35812 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
35813 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
35815 // TODO: Do some AVX512 subsets benefit from this transform?
35816 if (!Subtarget.hasAVX512())
35817 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
// Only sign-extending masked loads are handled below.
35821 if (Mld->getExtensionType() != ISD::SEXTLOAD)
35824 // Resolve extending loads.
35825 EVT VT = Mld->getValueType(0);
35826 unsigned NumElems = VT.getVectorNumElements();
35827 EVT LdVT = Mld->getMemoryVT();
35830 assert(LdVT != VT && "Cannot extend to the same type");
35831 unsigned ToSz = VT.getScalarSizeInBits();
35832 unsigned FromSz = LdVT.getScalarSizeInBits();
35833 // From/To sizes and ElemCount must be pow of two.
35834 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35835 "Unexpected size for extending masked load");
35837 unsigned SizeRatio = ToSz / FromSz;
35838 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
35840 // Create a type on which we perform the shuffle.
// WideVecVT has the memory element type but the full register width:
// NumElems * SizeRatio narrow elements.
35841 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35842 LdVT.getScalarType(), NumElems*SizeRatio);
35843 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35845 // Convert Src0 value.
// Spread the pass-through lanes so they occupy every SizeRatio-th slot of the
// wide vector, matching where the loaded narrow elements will land.
35846 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
35847 if (!Mld->getSrc0().isUndef()) {
35848 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35849 for (unsigned i = 0; i != NumElems; ++i)
35850 ShuffleVec[i] = i * SizeRatio;
35852 // Can't shuffle using an illegal type.
35853 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35854 "WideVecVT should be legal");
35855 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
35856 DAG.getUNDEF(WideVecVT), ShuffleVec);
35859 // Prepare the new mask.
35861 SDValue Mask = Mld->getMask();
35862 if (Mask.getValueType() == VT) {
35863 // Mask and original value have the same type.
// Widen a vector-typed mask by shuffling the live bits into the low slots and
// zero-filling the rest.
35864 NewMask = DAG.getBitcast(WideVecVT, Mask);
35865 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35866 for (unsigned i = 0; i != NumElems; ++i)
35867 ShuffleVec[i] = i * SizeRatio;
35868 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
35869 ShuffleVec[i] = NumElems * SizeRatio;
35870 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35871 DAG.getConstant(0, dl, WideVecVT),
// i1-typed mask (AVX-512 style): widen by concatenating zero vectors.
35874 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35875 unsigned WidenNumElts = NumElems*SizeRatio;
35876 unsigned MaskNumElts = VT.getVectorNumElements();
35877 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35880 unsigned NumConcat = WidenNumElts / MaskNumElts;
35881 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35882 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35884 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
// Emit the widened non-extending masked load, then sign-extend the narrow
// elements in-register to produce the original result type.
35887 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
35888 Mld->getBasePtr(), NewMask, WideSrc0,
35889 Mld->getMemoryVT(), Mld->getMemOperand(),
35891 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
35892 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
35895 /// If exactly one element of the mask is set for a non-truncating masked store,
35896 /// it is a vector extract and scalar store.
35897 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35898 /// mask have already been optimized in IR, so we don't bother with those here.
// NOTE(review): non-contiguous original numbering — SDLoc `DL` and an early
// return are elided in this excerpt.
35899 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
35900 SelectionDAG &DAG) {
35901 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35902 // However, some target hooks may need to be added to know when the transform
35903 // is profitable. Endianness would also have to be considered.
35905 SDValue Addr, VecIndex;
35906 unsigned Alignment;
// Bail out unless exactly one mask bit is set; on success the out-params give
// the scalar address, lane index and alignment.
35907 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
35910 // Extract the one scalar element that is actually being stored.
35912 EVT VT = MS->getValue().getValueType();
35913 EVT EltVT = VT.getVectorElementType();
35914 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
35915 MS->getValue(), VecIndex);
35917 // Store that element at the appropriate offset from the base pointer.
35918 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
35919 Alignment, MS->getMemOperand()->getFlags());
// DAG combine for ISD::MSTORE: scalarize one-element masks, strip a redundant
// PCMPGT-against-zero mask computation, and lower truncating masked stores
// (when no VPMOV* truncating store is legal) via a bitcast + shuffle into the
// narrow element type with a correspondingly widened mask.
// NOTE(review): non-contiguous original numbering — SDLoc `dl`, several early
// returns, closing braces and shuffle-mask arguments are elided here.
35922 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
35923 const X86Subtarget &Subtarget) {
35924 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
35926 if (Mst->isCompressingStore())
35929 if (!Mst->isTruncatingStore()) {
35930 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
35931 return ScalarStore;
35933 // If the mask is checking (0 > X), we're creating a vector with all-zeros
35934 // or all-ones elements based on the sign bits of X. AVX1 masked store only
35935 // cares about the sign bit of each mask element, so eliminate the compare:
35936 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
35937 // Note that by waiting to match an x86-specific PCMPGT node, we're
35938 // eliminating potentially more complex matching of a setcc node which has
35939 // a full range of predicates.
35940 SDValue Mask = Mst->getMask();
35941 if (Mask.getOpcode() == X86ISD::PCMPGT &&
35942 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
35943 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
35944 "Unexpected type for PCMPGT");
35945 return DAG.getMaskedStore(
35946 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
35947 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
35950 // TODO: AVX512 targets should also be able to simplify something like the
35951 // pattern above, but that pattern will be different. It will either need to
35952 // match setcc more generally or match PCMPGTM later (in tablegen?).
35957 // Resolve truncating stores.
35958 EVT VT = Mst->getValue().getValueType();
35959 unsigned NumElems = VT.getVectorNumElements();
35960 EVT StVT = Mst->getMemoryVT();
35963 assert(StVT != VT && "Cannot truncate to the same type");
35964 unsigned FromSz = VT.getScalarSizeInBits();
35965 unsigned ToSz = StVT.getScalarSizeInBits();
35967 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35969 // The truncating store is legal in some cases. For example
35970 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
35971 // are designated for truncate store.
35972 // In this case we don't need any further transformations.
35973 if (TLI.isTruncStoreLegal(VT, StVT))
35976 // From/To sizes and ElemCount must be pow of two.
35977 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35978 "Unexpected size for truncating masked store");
35979 // We are going to use the original vector elt for storing.
35980 // Accumulated smaller vector elements must be a multiple of the store size.
35981 assert (((NumElems * FromSz) % ToSz) == 0 &&
35982 "Unexpected ratio for truncating masked store");
35984 unsigned SizeRatio = FromSz / ToSz;
35985 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
35987 // Create a type on which we perform the shuffle.
// WideVecVT: the narrow store element type at full register width.
35988 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35989 StVT.getScalarType(), NumElems*SizeRatio);
35991 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
// Truncate by shuffling: pick every SizeRatio-th narrow element (the low part
// of each wide element on little-endian x86) into the low lanes.
35993 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
35994 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35995 for (unsigned i = 0; i != NumElems; ++i)
35996 ShuffleVec[i] = i * SizeRatio;
35998 // Can't shuffle using an illegal type.
35999 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
36000 "WideVecVT should be legal");
36002 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
36003 DAG.getUNDEF(WideVecVT),
36007 SDValue Mask = Mst->getMask();
36008 if (Mask.getValueType() == VT) {
36009 // Mask and original value have the same type.
// Widen a vector-typed mask: live bits into the low slots, zeros elsewhere.
36010 NewMask = DAG.getBitcast(WideVecVT, Mask);
36011 for (unsigned i = 0; i != NumElems; ++i)
36012 ShuffleVec[i] = i * SizeRatio;
36013 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
36014 ShuffleVec[i] = NumElems*SizeRatio;
36015 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
36016 DAG.getConstant(0, dl, WideVecVT),
// i1-typed mask (AVX-512 style): widen by concatenating zero vectors.
36019 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
36020 unsigned WidenNumElts = NumElems*SizeRatio;
36021 unsigned MaskNumElts = VT.getVectorNumElements();
36022 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
36025 unsigned NumConcat = WidenNumElts / MaskNumElts;
36026 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
36027 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
36029 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
// Re-emit as a non-truncating masked store of the pre-truncated value.
36032 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
36033 Mst->getBasePtr(), NewMask, StVT,
36034 Mst->getMemOperand(), false);
36037 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
36038 const X86Subtarget &Subtarget) {
36039 StoreSDNode *St = cast<StoreSDNode>(N);
36040 EVT VT = St->getValue().getValueType();
36041 EVT StVT = St->getMemoryVT();
36043 SDValue StoredVal = St->getOperand(1);
36044 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36046 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
36047 // This will avoid a copy to k-register.
36048 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
36049 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36050 StoredVal.getOperand(0).getValueType() == MVT::i8) {
36051 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
36052 St->getBasePtr(), St->getPointerInfo(),
36053 St->getAlignment(), St->getMemOperand()->getFlags());
36056 // Widen v2i1/v4i1 stores to v8i1.
36057 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
36058 Subtarget.hasAVX512()) {
36059 unsigned NumConcats = 8 / VT.getVectorNumElements();
36060 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
36061 Ops[0] = StoredVal;
36062 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
36063 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
36064 St->getPointerInfo(), St->getAlignment(),
36065 St->getMemOperand()->getFlags());
36068 // Turn vXi1 stores of constants into a scalar store.
36069 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
36070 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
36071 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
36072 // If its a v64i1 store without 64-bit support, we need two stores.
36073 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
36074 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
36075 StoredVal->ops().slice(0, 32));
36076 Lo = combinevXi1ConstantToInteger(Lo, DAG);
36077 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
36078 StoredVal->ops().slice(32, 32));
36079 Hi = combinevXi1ConstantToInteger(Hi, DAG);
36081 unsigned Alignment = St->getAlignment();
36083 SDValue Ptr0 = St->getBasePtr();
36084 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
36087 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
36088 Alignment, St->getMemOperand()->getFlags());
36090 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
36091 St->getPointerInfo().getWithOffset(4),
36092 MinAlign(Alignment, 4U),
36093 St->getMemOperand()->getFlags());
36094 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
36097 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
36098 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
36099 St->getPointerInfo(), St->getAlignment(),
36100 St->getMemOperand()->getFlags());
36103 // If we are saving a concatenation of two XMM registers and 32-byte stores
36104 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
36106 unsigned AddressSpace = St->getAddressSpace();
36107 unsigned Alignment = St->getAlignment();
36108 if (VT.is256BitVector() && StVT == VT &&
36109 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
36110 AddressSpace, Alignment, &Fast) &&
36112 unsigned NumElems = VT.getVectorNumElements();
36116 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
36117 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
36119 SDValue Ptr0 = St->getBasePtr();
36120 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
36123 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
36124 Alignment, St->getMemOperand()->getFlags());
36126 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
36127 St->getPointerInfo().getWithOffset(16),
36128 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
36129 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
36132 // Optimize trunc store (of multiple scalars) to shuffle and store.
36133 // First, pack all of the elements in one place. Next, store to memory
36134 // in fewer chunks.
36135 if (St->isTruncatingStore() && VT.isVector()) {
36136 // Check if we can detect an AVG pattern from the truncation. If yes,
36137 // replace the trunc store by a normal store with the result of X86ISD::AVG
36139 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
36141 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
36142 St->getPointerInfo(), St->getAlignment(),
36143 St->getMemOperand()->getFlags());
36145 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36147 detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
36149 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
36150 dl, Val, St->getBasePtr(),
36151 St->getMemoryVT(), St->getMemOperand(), DAG);
36152 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
36153 DAG, dl, Subtarget, TLI))
36154 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
36155 dl, Val, St->getBasePtr(),
36156 St->getMemoryVT(), St->getMemOperand(), DAG);
36158 unsigned NumElems = VT.getVectorNumElements();
36159 assert(StVT != VT && "Cannot truncate to the same type");
36160 unsigned FromSz = VT.getScalarSizeInBits();
36161 unsigned ToSz = StVT.getScalarSizeInBits();
36163 // The truncating store is legal in some cases. For example
36164 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
36165 // are designated for truncate store.
36166 // In this case we don't need any further transformations.
36167 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
36170 // From, To sizes and ElemCount must be pow of two
36171 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
36172 // We are going to use the original vector elt for storing.
36173 // Accumulated smaller vector elements must be a multiple of the store size.
36174 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
36176 unsigned SizeRatio = FromSz / ToSz;
36178 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
36180 // Create a type on which we perform the shuffle
36181 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
36182 StVT.getScalarType(), NumElems*SizeRatio);
36184 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
36186 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
36187 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
36188 for (unsigned i = 0; i != NumElems; ++i)
36189 ShuffleVec[i] = i * SizeRatio;
36191 // Can't shuffle using an illegal type.
36192 if (!TLI.isTypeLegal(WideVecVT))
36195 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
36196 DAG.getUNDEF(WideVecVT),
36198 // At this point all of the data is stored at the bottom of the
36199 // register. We now need to save it to mem.
36201 // Find the largest store unit
36202 MVT StoreType = MVT::i8;
36203 for (MVT Tp : MVT::integer_valuetypes()) {
36204 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
36208 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
36209 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
36210 (64 <= NumElems * ToSz))
36211 StoreType = MVT::f64;
36213 // Bitcast the original vector into a vector of store-size units
36214 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
36215 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
36216 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
36217 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
36218 SmallVector<SDValue, 8> Chains;
36219 SDValue Ptr = St->getBasePtr();
36221 // Perform one or more big stores into memory.
36222 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
36223 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
36224 StoreType, ShuffWide,
36225 DAG.getIntPtrConstant(i, dl));
36227 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
36228 St->getAlignment(), St->getMemOperand()->getFlags());
36229 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
36230 Chains.push_back(Ch);
36233 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
36236 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
36237 // the FP state in cases where an emms may be missing.
36238 // A preferable solution to the general problem is to figure out the right
36239 // places to insert EMMS. This qualifies as a quick hack.
36241 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
36242 if (VT.getSizeInBits() != 64)
36245 const Function &F = DAG.getMachineFunction().getFunction();
36246 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
36248 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
36249 if ((VT.isVector() ||
36250 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
36251 isa<LoadSDNode>(St->getValue()) &&
36252 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
36253 St->getChain().hasOneUse() && !St->isVolatile()) {
36254 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
36255 SmallVector<SDValue, 8> Ops;
36257 if (!ISD::isNormalLoad(Ld))
36260 // If this is not the MMX case, i.e. we are just turning i64 load/store
36261 // into f64 load/store, avoid the transformation if there are multiple
36262 // uses of the loaded value.
36263 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
36268 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
36269 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
36271 if (Subtarget.is64Bit() || F64IsLegal) {
36272 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
36273 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
36274 Ld->getMemOperand());
36276 // Make sure new load is placed in same chain order.
36277 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
36278 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
36279 St->getMemOperand());
36282 // Otherwise, lower to two pairs of 32-bit loads / stores.
36283 SDValue LoAddr = Ld->getBasePtr();
36284 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
36286 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
36287 Ld->getPointerInfo(), Ld->getAlignment(),
36288 Ld->getMemOperand()->getFlags());
36289 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
36290 Ld->getPointerInfo().getWithOffset(4),
36291 MinAlign(Ld->getAlignment(), 4),
36292 Ld->getMemOperand()->getFlags());
36293 // Make sure new loads are placed in same chain order.
36294 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
36295 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
36297 LoAddr = St->getBasePtr();
36298 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
36301 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
36302 St->getAlignment(), St->getMemOperand()->getFlags());
36303 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
36304 St->getPointerInfo().getWithOffset(4),
36305 MinAlign(St->getAlignment(), 4),
36306 St->getMemOperand()->getFlags());
36307 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
36310 // This is similar to the above case, but here we handle a scalar 64-bit
36311 // integer store that is extracted from a vector on a 32-bit target.
36312 // If we have SSE2, then we can treat it like a floating-point double
36313 // to get past legalization. The execution dependencies fixup pass will
36314 // choose the optimal machine instruction for the store if this really is
36315 // an integer or v2f32 rather than an f64.
36316 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
36317 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
36318 SDValue OldExtract = St->getOperand(1);
36319 SDValue ExtOp0 = OldExtract.getOperand(0);
36320 unsigned VecSize = ExtOp0.getValueSizeInBits();
36321 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
36322 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
36323 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
36324 BitCast, OldExtract.getOperand(1));
36325 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
36326 St->getPointerInfo(), St->getAlignment(),
36327 St->getMemOperand()->getFlags());
36333 /// Return 'true' if this vector operation is "horizontal"
36334 /// and return the operands for the horizontal operation in LHS and RHS. A
36335 /// horizontal operation performs the binary operation on successive elements
36336 /// of its first operand, then on successive elements of its second operand,
36337 /// returning the resulting values in a vector. For example, if
36338 /// A = < float a0, float a1, float a2, float a3 >
36340 /// B = < float b0, float b1, float b2, float b3 >
36341 /// then the result of doing a horizontal operation on A and B is
36342 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
36343 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
36344 /// A horizontal-op B, for some already available A and B, and if so then LHS is
36345 /// set to A, RHS to B, and the routine returns 'true'.
36346 /// Note that the binary operation should have the property that if one of the
36347 /// operands is UNDEF then the result is UNDEF.
// NOTE(review): this excerpt elides some original lines (early-return paths,
// else-branches); the comments below describe only the visible code.
36348 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
36349 // Look for the following pattern: if
36350 // A = < float a0, float a1, float a2, float a3 >
36351 // B = < float b0, float b1, float b2, float b3 >
36353 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
36354 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
36355 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
36356 // which is A horizontal-op B.
36358 // At least one of the operands should be a vector shuffle.
36359 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
36360 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
36363 MVT VT = LHS.getSimpleValueType();
36365 assert((VT.is128BitVector() || VT.is256BitVector()) &&
36366 "Unsupported vector type for horizontal add/sub");
36368 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
36369 // operate independently on 128-bit lanes.
36370 unsigned NumElts = VT.getVectorNumElements();
36371 unsigned NumLanes = VT.getSizeInBits()/128;
36372 unsigned NumLaneElts = NumElts / NumLanes;
36373 assert((NumLaneElts % 2 == 0) &&
36374 "Vector type should have an even number of elements in each lane");
// Half of each lane's results come from each source vector (see the Index
// computation in the mask-validation loop below).
36375 unsigned HalfLaneElts = NumLaneElts/2;
36377 // View LHS in the form
36378 // LHS = VECTOR_SHUFFLE A, B, LMask
36379 // If LHS is not a shuffle then pretend it is the shuffle
36380 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
36381 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
36384 SmallVector<int, 16> LMask(NumElts);
36385 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
// Undef shuffle operands are left as default (empty) SDValues so the
// mask-checking loop can treat references into them as "don't care".
36386 if (!LHS.getOperand(0).isUndef())
36387 A = LHS.getOperand(0);
36388 if (!LHS.getOperand(1).isUndef())
36389 B = LHS.getOperand(1);
36390 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
36391 std::copy(Mask.begin(), Mask.end(), LMask.begin());
36393 if (!LHS.isUndef())
36395 for (unsigned i = 0; i != NumElts; ++i)
36399 // Likewise, view RHS in the form
36400 // RHS = VECTOR_SHUFFLE C, D, RMask
36402 SmallVector<int, 16> RMask(NumElts);
36403 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
36404 if (!RHS.getOperand(0).isUndef())
36405 C = RHS.getOperand(0);
36406 if (!RHS.getOperand(1).isUndef())
36407 D = RHS.getOperand(1);
36408 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
36409 std::copy(Mask.begin(), Mask.end(), RMask.begin());
36411 if (!RHS.isUndef())
36413 for (unsigned i = 0; i != NumElts; ++i)
36417 // Check that the shuffles are both shuffling the same vectors.
36418 if (!(A == C && B == D) && !(A == D && B == C))
36421 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
36422 if (!A.getNode() && !B.getNode())
36425 // If A and B occur in reverse order in RHS, then "swap" them (which means
36426 // rewriting the mask).
36428 ShuffleVectorSDNode::commuteMask(RMask);
36430 // At this point LHS and RHS are equivalent to
36431 // LHS = VECTOR_SHUFFLE A, B, LMask
36432 // RHS = VECTOR_SHUFFLE A, B, RMask
36433 // Check that the masks correspond to performing a horizontal operation.
36434 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
36435 for (unsigned i = 0; i != NumLaneElts; ++i) {
36436 int LIdx = LMask[i+l], RIdx = RMask[i+l];
36438 // Ignore any UNDEF components.
36439 if (LIdx < 0 || RIdx < 0 ||
36440 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
36441 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
36444 // Check that successive elements are being operated on. If not, this is
36445 // not a horizontal operation.
36446 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
36447 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
// For a commutative op the pair (Index+1, Index) is also acceptable.
36448 if (!(LIdx == Index && RIdx == Index + 1) &&
36449 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
36454 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
36455 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
36459 /// Do target-specific dag combines on floating-point adds/subs.
// Synthesizes X86ISD::FHADD/FHSUB from (f)add/(f)sub of interleaving shuffles
// when the subtarget supports the horizontal instructions (SSE3 for 128-bit
// f32/f64 vectors, AVX for the 256-bit widenings).
36460 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
36461 const X86Subtarget &Subtarget) {
36462 EVT VT = N->getValueType(0);
36463 SDValue LHS = N->getOperand(0);
36464 SDValue RHS = N->getOperand(1);
36465 bool IsFadd = N->getOpcode() == ISD::FADD;
36466 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
36468 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
// FADD is commutative, so IsFadd doubles as the IsCommutative flag; on
// success isHorizontalBinOp rewrites LHS/RHS to the underlying vectors.
36469 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
36470 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
36471 isHorizontalBinOp(LHS, RHS, IsFadd)) {
36472 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
36473 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
36478 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
36480 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
// NOTE(review): the opcode-dispatch guards that select which BINOPs reach the
// TruncateArithmetic calls below appear elided in this excerpt.
36481 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
36482 const X86Subtarget &Subtarget,
36484 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
36485 SDValue Src = N->getOperand(0);
36486 unsigned Opcode = Src.getOpcode();
36487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36489 EVT VT = N->getValueType(0);
36490 EVT SrcVT = Src.getValueType();
// Returns true when truncating both operands costs at most one truncation:
// either the operand is repeated, an extend that the truncate cancels, or a
// constant that will fold.
36492 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
36493 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
36495 // Repeated operand, so we are only trading one output truncation for
36496 // one input truncation.
36500 // See if either operand has been extended from a smaller/equal size to
36501 // the truncation size, allowing a truncation to combine with the extend.
36502 unsigned Opcode0 = Op0.getOpcode();
36503 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
36504 Opcode0 == ISD::ZERO_EXTEND) &&
36505 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36508 unsigned Opcode1 = Op1.getOpcode();
36509 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
36510 Opcode1 == ISD::ZERO_EXTEND) &&
36511 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36514 // See if either operand is a single use constant which can be constant
36516 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
36517 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
36518 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
36519 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
// Builds BINOP(TRUNC(N0), TRUNC(N1)) in the narrow type VT.
36522 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
36523 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
36524 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
36525 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
36528 // Don't combine if the operation has other uses.
36529 if (!N->isOnlyUserOf(Src.getNode()))
36532 // Only support vector truncation for now.
36533 // TODO: i64 scalar math would benefit as well.
36534 if (!VT.isVector())
36537 // In most cases its only worth pre-truncating if we're only facing the cost
36538 // of one truncation.
36539 // i.e. if one of the inputs will constant fold or the input is repeated.
36544 SDValue Op0 = Src.getOperand(0);
36545 SDValue Op1 = Src.getOperand(1);
36546 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
36547 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36548 return TruncateArithmetic(Op0, Op1);
36553 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
36554 // better to truncate if we have the chance.
36555 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
36556 !TLI.isOperationLegal(Opcode, SrcVT))
36557 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
36560 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
36561 SDValue Op0 = Src.getOperand(0);
36562 SDValue Op1 = Src.getOperand(1);
36563 if (TLI.isOperationLegal(Opcode, VT) &&
36564 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36565 return TruncateArithmetic(Op0, Op1);
36573 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
// Masks off the bits that would be dropped by the truncation so the
// saturating PACKUS behaves like a plain truncate, then packs.
36574 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
36575 const X86Subtarget &Subtarget,
36576 SelectionDAG &DAG) {
36577 SDValue In = N->getOperand(0);
36578 EVT InVT = In.getValueType();
36579 EVT InSVT = InVT.getVectorElementType();
36580 EVT OutVT = N->getValueType(0);
36581 EVT OutSVT = OutVT.getVectorElementType();
36583 // Split a long vector into vectors of legal type and mask to unset all bits
36584 // that won't appear in the result to prevent saturation.
36585 // TODO - we should be doing this at the maximum legal size but this is
36586 // causing regressions where we're concatenating back to max width just to
36587 // perform the AND and then extracting back again.....
36588 unsigned NumSubRegs = InVT.getSizeInBits() / 128;
36589 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
36590 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
36591 SmallVector<SDValue, 8> SubVecs(NumSubRegs);
// Mask keeps only the low OutSVT-sized bits of each InSVT element.
36594 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
36595 SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
36597 for (unsigned i = 0; i < NumSubRegs; i++) {
36598 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
36599 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
36600 SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
36602 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
36604 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
36607 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
// Sign-extends in-register first so the saturating PACKSS produces the same
// result as a plain truncate.
36608 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
36609 const X86Subtarget &Subtarget,
36610 SelectionDAG &DAG) {
36611 SDValue In = N->getOperand(0);
36612 EVT InVT = In.getValueType();
36613 EVT OutVT = N->getValueType(0);
36614 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
36615 DAG.getValueType(OutVT));
36616 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
36619 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
36620 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
36621 /// legalization the truncation will be translated into a BUILD_VECTOR with each
36622 /// element that is extracted from a vector and then truncated, and it is
36623 /// difficult to do this optimization based on them.
36624 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
36625 const X86Subtarget &Subtarget) {
36626 EVT OutVT = N->getValueType(0);
36627 if (!OutVT.isVector())
36630 SDValue In = N->getOperand(0);
// Bail on non-simple (extended) types we can't map to MVTs.
36631 if (!In.getValueType().isSimple())
36634 EVT InVT = In.getValueType();
36635 unsigned NumElems = OutVT.getVectorNumElements();
36637 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
36638 // SSE2, and we need to take care of it specially.
36639 // AVX512 provides vpmovdb.
36640 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
36643 EVT OutSVT = OutVT.getVectorElementType();
36644 EVT InSVT = InVT.getVectorElementType();
// Only handle i32/i64 -> i8/i16 truncations of pow2-sized vectors.
36645 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
36646 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
36650 // SSSE3's pshufb results in less instructions in the cases below.
36651 if (Subtarget.hasSSSE3() && NumElems == 8 &&
36652 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
36653 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
36657 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
36658 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
36659 // truncate 2 x v4i32 to v8i16.
36660 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
36661 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
36662 if (InSVT == MVT::i32)
36663 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
36668 /// This function transforms vector truncation of 'extended sign-bits' or
36669 /// 'extended zero-bits' values.
36670 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
36671 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
36673 const X86Subtarget &Subtarget) {
36674 // Requires SSE2 but AVX512 has fast truncate.
36675 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36678 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
36681 SDValue In = N->getOperand(0);
36682 if (!In.getValueType().isSimple())
36685 MVT VT = N->getValueType(0).getSimpleVT();
36686 MVT SVT = VT.getScalarType();
36688 MVT InVT = In.getValueType().getSimpleVT();
36689 MVT InSVT = InVT.getScalarType();
36691 // Check we have a truncation suited for PACKSS/PACKUS.
36692 if (!VT.is128BitVector() && !VT.is256BitVector())
36694 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
36696 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
// PACKSS saturates to 16-bit signed values; SSE2 PACKUS (packuswb) only packs
// to 8-bit, while SSE4.1 adds packusdw for 16-bit unsigned packing.
36699 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
36700 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
36702 // Use PACKUS if the input has zero-bits that extend all the way to the
36703 // packed/truncated value. e.g. masks, zext_in_reg, etc.
36705 DAG.computeKnownBits(In, Known);
36706 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
36707 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
36708 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
36710 // Use PACKSS if the input has sign-bits that extend all the way to the
36711 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
36712 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
36713 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
36714 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
36719 // Try to form a MULHU or MULHS node by looking for
36720 // (trunc (srl (mul ext, ext), 16))
36721 // TODO: This is X86 specific because we want to be able to handle wide types
36722 // before type legalization. But we can only do it if the vector will be
36723 // legalized via widening/splitting. Type legalization can't handle promotion
36724 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
36726 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
36727 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36728 // First instruction should be a right shift of a multiply.
36729 if (Src.getOpcode() != ISD::SRL ||
36730 Src.getOperand(0).getOpcode() != ISD::MUL)
36733 if (!Subtarget.hasSSE2())
36736 // Only handle vXi16 types that are at least 128-bits.
36737 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
36738 VT.getVectorNumElements() < 8)
36741 // Input type should be vXi32.
36742 EVT InVT = Src.getValueType();
36743 if (InVT.getVectorElementType() != MVT::i32)
36746 // Need a shift by 16.
// The shift amount must be a splat constant (checked here); the value check
// against 16 is elided in this excerpt.
36748 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
36752 SDValue LHS = Src.getOperand(0).getOperand(0);
36753 SDValue RHS = Src.getOperand(0).getOperand(1);
// Both multiply operands must be the same kind of extend: sign-extends give
// MULHS, zero-extends give MULHU.
36755 unsigned ExtOpc = LHS.getOpcode();
36756 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
36757 RHS.getOpcode() != ExtOpc)
36760 // Peek through the extends.
36761 LHS = LHS.getOperand(0);
36762 RHS = RHS.getOperand(0);
36764 // Ensure the input types match.
36765 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
36768 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
36769 return DAG.getNode(Opc, DL, VT, LHS, RHS);
36772 // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
36773 // from one vector with signed bytes from another vector, adds together
36774 // adjacent pairs of 16-bit products, and saturates the result before
36775 // truncating to 16-bits.
36777 // Which looks something like this:
36778 // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
36779 // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
36780 static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
36781 const X86Subtarget &Subtarget,
36783 if (!VT.isVector() || !Subtarget.hasSSSE3())
36786 unsigned NumElems = VT.getVectorNumElements();
36787 EVT ScalarVT = VT.getVectorElementType();
// PMADDUBSW produces vXi16 results; require at least a full 128-bit vector.
36788 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
36791 SDValue SSatVal = detectSSatPattern(In, VT);
36792 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
36795 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
36796 // of multiplies from even/odd elements.
36797 SDValue N0 = SSatVal.getOperand(0);
36798 SDValue N1 = SSatVal.getOperand(1);
36800 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
36803 SDValue N00 = N0.getOperand(0);
36804 SDValue N01 = N0.getOperand(1);
36805 SDValue N10 = N1.getOperand(0);
36806 SDValue N11 = N1.getOperand(1);
36808 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
36809 // Canonicalize zero_extend to LHS.
36810 if (N01.getOpcode() == ISD::ZERO_EXTEND)
36811 std::swap(N00, N01);
36812 if (N11.getOpcode() == ISD::ZERO_EXTEND)
36813 std::swap(N10, N11);
36815 // Ensure we have a zero_extend and a sign_extend.
36816 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
36817 N01.getOpcode() != ISD::SIGN_EXTEND ||
36818 N10.getOpcode() != ISD::ZERO_EXTEND ||
36819 N11.getOpcode() != ISD::SIGN_EXTEND)
36822 // Peek through the extends.
36823 N00 = N00.getOperand(0);
36824 N01 = N01.getOperand(0);
36825 N10 = N10.getOperand(0);
36826 N11 = N11.getOperand(0);
36828 // Ensure the extend is from vXi8.
36829 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
36830 N01.getValueType().getVectorElementType() != MVT::i8 ||
36831 N10.getValueType().getVectorElementType() != MVT::i8 ||
36832 N11.getValueType().getVectorElementType() != MVT::i8)
36835 // All inputs should be build_vectors.
36836 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
36837 N01.getOpcode() != ISD::BUILD_VECTOR ||
36838 N10.getOpcode() != ISD::BUILD_VECTOR ||
36839 N11.getOpcode() != ISD::BUILD_VECTOR)
36842 // N00/N10 are zero extended. N01/N11 are sign extended.
36844 // For each element, we need to ensure we have an odd element from one vector
36845 // multiplied by the odd element of another vector and the even element from
36846 // one of the same vectors being multiplied by the even element from the
36847 // other vector. So we need to make sure for each element i, this operator
36848 // is being performed:
36849 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
36850 SDValue ZExtIn, SExtIn;
36851 for (unsigned i = 0; i != NumElems; ++i) {
36852 SDValue N00Elt = N00.getOperand(i);
36853 SDValue N01Elt = N01.getOperand(i);
36854 SDValue N10Elt = N10.getOperand(i);
36855 SDValue N11Elt = N11.getOperand(i);
36856 // TODO: Be more tolerant to undefs.
// Every build_vector element must be an extract from a vector with a
// constant index so the even/odd pairing can be verified.
36857 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
36858 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
36859 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
36860 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
36862 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
36863 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
36864 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
36865 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
36866 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
36868 unsigned IdxN00 = ConstN00Elt->getZExtValue();
36869 unsigned IdxN01 = ConstN01Elt->getZExtValue();
36870 unsigned IdxN10 = ConstN10Elt->getZExtValue();
36871 unsigned IdxN11 = ConstN11Elt->getZExtValue();
36872 // Add is commutative so indices can be reordered.
36873 if (IdxN00 > IdxN10) {
36874 std::swap(IdxN00, IdxN10);
36875 std::swap(IdxN01, IdxN11);
36877 // N0 indices be the even element. N1 indices must be the next odd element.
36878 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
36879 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
36881 SDValue N00In = N00Elt.getOperand(0);
36882 SDValue N01In = N01Elt.getOperand(0);
36883 SDValue N10In = N10Elt.getOperand(0);
36884 SDValue N11In = N11Elt.getOperand(0);
36885 // First time we find an input capture it.
// NOTE(review): the initial ZExtIn/SExtIn capture assignments are elided in
// this excerpt; subsequent iterations must extract from the same vectors.
36890 if (ZExtIn != N00In || SExtIn != N01In ||
36891 ZExtIn != N10In || SExtIn != N11In)
36895 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
36896 ArrayRef<SDValue> Ops) {
36897 // Shrink by adding truncate nodes and let DAGCombine fold with the
36899 EVT InVT = Ops[0].getValueType();
36900 assert(InVT.getScalarType() == MVT::i8 &&
36901 "Unexpected scalar element type");
36902 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
// PMADDUBSW halves the element count: vXi8 inputs -> (X/2) x i16 results.
36903 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
36904 InVT.getVectorNumElements() / 2);
36905 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
36907 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
// Top-level DAG combine for ISD::TRUNCATE: tries the specialised truncate
// patterns (pre-truncated arithmetic, AVG, PMADDUBSW, saturation, PMULH,
// MMX movd, sign/zero-bit PACK) before the generic PACK-based lowering.
36911 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
36912 const X86Subtarget &Subtarget) {
36913 EVT VT = N->getValueType(0);
36914 SDValue Src = N->getOperand(0);
36917 // Attempt to pre-truncate inputs to arithmetic ops instead.
36918 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
36921 // Try to detect AVG pattern first.
36922 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
36925 // Try to detect PMADD
36926 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
36929 // Try to combine truncation with signed/unsigned saturation.
36930 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
36933 // Try to combine PMULHUW/PMULHW for vXi16.
36934 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
36937 // The bitcast source is a direct mmx result.
36938 // Detect bitcasts between i32 to x86mmx
36939 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
36940 SDValue BCSrc = Src.getOperand(0);
36941 if (BCSrc.getValueType() == MVT::x86mmx)
36942 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
36945 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
36946 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
36949 return combineVectorTruncation(N, DAG, Subtarget);
36952 /// Returns the negated value if the node \p N flips sign of FP value.
36954 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
36955 /// AVX512F does not have FXOR, so FNEG is lowered as
36956 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
36957 /// In this case we go though all bitcasts.
36958 static SDValue isFNEG(SDNode *N) {
36959 if (N->getOpcode() == ISD::FNEG)
36960 return N->getOperand(0);
36962 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
36963 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
36966 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
36967 if (!Op1.getValueType().isFloatingPoint())
36970 // Extract constant bits and see if they are all sign bit masks.
36972 SmallVector<APInt, 16> EltBits;
// XOR with a per-element sign-bit mask is exactly an FP negation.
36973 if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(),
36974 UndefElts, EltBits, false, false))
36975 if (llvm::all_of(EltBits, [](APInt &I) { return I.isSignMask(); }))
36976 return peekThroughBitcasts(Op.getOperand(0));
36981 /// Do target-specific dag combines on floating point negations.
36982 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
36983 const X86Subtarget &Subtarget) {
36984 EVT OrigVT = N->getValueType(0);
36985 SDValue Arg = isFNEG(N);
36986 assert(Arg.getNode() && "N is expected to be an FNEG node");
36988 EVT VT = Arg.getValueType();
36989 EVT SVT = VT.getScalarType();
36992 // Let legalize expand this if it isn't a legal type yet.
36993 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36996 // If we're negating a FMUL node on a target with FMA, then we can avoid the
36997 // use of a constant by performing (-0 - A*B) instead.
36998 // FIXME: Check rounding control flags as well once it becomes available.
// NoSignedZeros is required because FNMSUB(A, B, -0.0) computes -(A*B)-(-0)
// which only matches -(A*B) when signed zeros don't matter.
36999 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
37000 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
37001 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
37002 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
37003 Arg.getOperand(1), Zero);
37004 return DAG.getBitcast(OrigVT, NewNode);
37007 // If we're negating an FMA node, then we can adjust the
37008 // instruction to include the extra negation.
37009 unsigned NewOpcode = 0;
37010 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
37011 switch (Arg.getOpcode()) {
37012 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
37013 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
37014 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
37015 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
37016 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
37017 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
37018 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
37019 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
37020 // We can't handle scalar intrinsic node here because it would only
37021 // invert one element and not the whole vector. But we could try to handle
37022 // a negation of the lower element only.
37026 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
37027 Arg.getNode()->ops()));
// Rewrites X86 FP logic ops (FOR/FXOR/FAND/FANDN) as the equivalent integer
// logic on a bitcast v*i64 vector when SSE2 integer vectors are available.
37032 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
37033 const X86Subtarget &Subtarget) {
37034 MVT VT = N->getSimpleValueType(0);
37035 // If we have integer vector types available, use the integer opcodes.
37036 if (VT.isVector() && Subtarget.hasSSE2()) {
37039 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
37041 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
37042 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
37043 unsigned IntOpcode;
37044 switch (N->getOpcode()) {
37045 default: llvm_unreachable("Unexpected FP logic op");
37046 case X86ISD::FOR: IntOpcode = ISD::OR; break;
37047 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
37048 case X86ISD::FAND: IntOpcode = ISD::AND; break;
37049 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
37051 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
37052 return DAG.getBitcast(VT, IntOp);
37058 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
37059 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
37060 if (N->getOpcode() != ISD::XOR)
// Only (xor (x86 setcc), 1) qualifies: setcc produces 0/1 so xor-with-1 is a
// boolean NOT, which we fold into the condition code instead.
37063 SDValue LHS = N->getOperand(0);
37064 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
37065 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
37068 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
37069 X86::CondCode(LHS->getConstantOperandVal(0)));
37071 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
37074 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
37075 TargetLowering::DAGCombinerInfo &DCI,
37076 const X86Subtarget &Subtarget) {
37077 // If this is SSE1 only convert to FXOR to avoid scalarization.
37078 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
37079 N->getValueType(0) == MVT::v4i32) {
37080 return DAG.getBitcast(
37081 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
37082 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
37083 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
37086 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
37089 if (DCI.isBeforeLegalizeOps())
37092 if (SDValue SetCC = foldXor1SetCC(N, DAG))
37095 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
37098 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
37102 return combineFneg(N, DAG, Subtarget);
37106 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
37107 TargetLowering::DAGCombinerInfo &DCI,
37108 const X86Subtarget &Subtarget) {
37109 SDValue Op0 = N->getOperand(0);
37110 SDValue Op1 = N->getOperand(1);
37111 EVT VT = N->getValueType(0);
37112 unsigned NumBits = VT.getSizeInBits();
37114 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37115 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
37116 !DCI.isBeforeLegalizeOps());
37118 // TODO - Constant Folding.
37119 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37120 // Reduce Cst1 to the bottom 16-bits.
37121 // NOTE: SimplifyDemandedBits won't do this for constants.
37122 const APInt &Val1 = Cst1->getAPIntValue();
37123 APInt MaskedVal1 = Val1 & 0xFFFF;
37124 if (MaskedVal1 != Val1)
37125 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
37126 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
37129 // Only bottom 16-bits of the control bits are required.
37131 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
37132 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
37133 DCI.CommitTargetLoweringOpt(TLO);
37134 return SDValue(N, 0);
37140 static bool isNullFPScalarOrVectorConst(SDValue V) {
37141 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
37144 /// If a value is a scalar FP zero or a vector FP zero (potentially including
37145 /// undefined elements), return a zero constant that may be used to fold away
37146 /// that value. In the case of a vector, the returned constant will not contain
37147 /// undefined elements even if the input parameter does. This makes it suitable
37148 /// to be used as a replacement operand with operations (eg, bitwise-and) where
37149 /// an undef should not propagate.
37150 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
37151 const X86Subtarget &Subtarget) {
37152 if (!isNullFPScalarOrVectorConst(V))
37155 if (V.getValueType().isVector())
37156 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
37161 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
37162 const X86Subtarget &Subtarget) {
37163 SDValue N0 = N->getOperand(0);
37164 SDValue N1 = N->getOperand(1);
37165 EVT VT = N->getValueType(0);
37168 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
37169 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
37170 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
37171 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
37174 auto isAllOnesConstantFP = [](SDValue V) {
37175 if (V.getSimpleValueType().isVector())
37176 return ISD::isBuildVectorAllOnes(V.getNode());
37177 auto *C = dyn_cast<ConstantFPSDNode>(V);
37178 return C && C->getConstantFPValue()->isAllOnesValue();
37181 // fand (fxor X, -1), Y --> fandn X, Y
37182 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
37183 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
37185 // fand X, (fxor Y, -1) --> fandn Y, X
37186 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
37187 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
37192 /// Do target-specific dag combines on X86ISD::FAND nodes.
37193 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
37194 const X86Subtarget &Subtarget) {
37195 // FAND(0.0, x) -> 0.0
37196 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
37199 // FAND(x, 0.0) -> 0.0
37200 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37203 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
37206 return lowerX86FPLogicOp(N, DAG, Subtarget);
37209 /// Do target-specific dag combines on X86ISD::FANDN nodes.
37210 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
37211 const X86Subtarget &Subtarget) {
37212 // FANDN(0.0, x) -> x
37213 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37214 return N->getOperand(1);
37216 // FANDN(x, 0.0) -> 0.0
37217 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37220 return lowerX86FPLogicOp(N, DAG, Subtarget);
37223 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
37224 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
37225 const X86Subtarget &Subtarget) {
37226 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
37228 // F[X]OR(0.0, x) -> x
37229 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37230 return N->getOperand(1);
37232 // F[X]OR(x, 0.0) -> x
37233 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
37234 return N->getOperand(0);
37237 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
37240 return lowerX86FPLogicOp(N, DAG, Subtarget);
37243 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
37244 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
37245 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
37247 // Only perform optimizations if UnsafeMath is used.
37248 if (!DAG.getTarget().Options.UnsafeFPMath)
37251 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
37252 // into FMINC and FMAXC, which are Commutative operations.
37253 unsigned NewOp = 0;
37254 switch (N->getOpcode()) {
37255 default: llvm_unreachable("unknown opcode");
37256 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
37257 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
37260 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
37261 N->getOperand(0), N->getOperand(1));
37264 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
37265 const X86Subtarget &Subtarget) {
37266 if (Subtarget.useSoftFloat())
37269 // TODO: If an operand is already known to be a NaN or not a NaN, this
37270 // should be an optional swap and FMAX/FMIN.
37272 EVT VT = N->getValueType(0);
37273 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
37274 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
37275 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
37278 SDValue Op0 = N->getOperand(0);
37279 SDValue Op1 = N->getOperand(1);
37281 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
37283 // If we don't have to respect NaN inputs, this is a direct translation to x86
37284 // min/max instructions.
37285 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
37286 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
37288 // If we have to respect NaN inputs, this takes at least 3 instructions.
37289 // Favor a library call when operating on a scalar and minimizing code size.
37290 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
37293 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
37294 DAG.getDataLayout(), *DAG.getContext(), VT);
37296 // There are 4 possibilities involving NaN inputs, and these are the required
37300 // ----------------
37301 // Num | Max | Op0 |
37302 // Op0 ----------------
37303 // NaN | Op1 | NaN |
37304 // ----------------
37306 // The SSE FP max/min instructions were not designed for this case, but rather
37308 // Min = Op1 < Op0 ? Op1 : Op0
37309 // Max = Op1 > Op0 ? Op1 : Op0
37311 // So they always return Op0 if either input is a NaN. However, we can still
37312 // use those instructions for fmaxnum by selecting away a NaN input.
37314 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
37315 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
37316 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
37318 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
37319 // are NaN, the NaN value of Op1 is the result.
37320 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
37323 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
37324 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
37325 TargetLowering::DAGCombinerInfo &DCI,
37326 const X86Subtarget &Subtarget) {
37327 // ANDNP(0, x) -> x
37328 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
37329 return N->getOperand(1);
37331 // ANDNP(x, 0) -> 0
37332 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
37333 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
37335 EVT VT = N->getValueType(0);
37337 // Attempt to recursively combine a bitmask ANDNP with shuffles.
37338 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
37340 if (SDValue Res = combineX86ShufflesRecursively(
37341 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
37342 /*HasVarMask*/ false, DAG, Subtarget))
37349 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
37350 TargetLowering::DAGCombinerInfo &DCI) {
37351 SDValue N0 = N->getOperand(0);
37352 SDValue N1 = N->getOperand(1);
37354 // BT ignores high bits in the bit index operand.
37355 unsigned BitWidth = N1.getValueSizeInBits();
37356 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
37357 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
37358 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
37363 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
37364 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
37365 EVT VT = N->getValueType(0);
37367 SDValue N0 = N->getOperand(0);
37368 SDValue N1 = N->getOperand(1);
37369 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37371 if (ExtraVT != MVT::i16)
37374 // Look through single use any_extends.
37375 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
37376 N0 = N0.getOperand(0);
37378 // See if we have a single use cmov.
37379 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
37382 SDValue CMovOp0 = N0.getOperand(0);
37383 SDValue CMovOp1 = N0.getOperand(1);
37385 // Make sure both operands are constants.
37386 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37387 !isa<ConstantSDNode>(CMovOp1.getNode()))
37392 // If we looked through an any_extend above, add one to the constants.
37393 if (N0.getValueType() != VT) {
37394 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
37395 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
37398 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
37399 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
37401 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
37402 N0.getOperand(2), N0.getOperand(3));
37405 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
37406 const X86Subtarget &Subtarget) {
37407 if (SDValue V = combineSextInRegCmov(N, DAG))
37410 EVT VT = N->getValueType(0);
37411 SDValue N0 = N->getOperand(0);
37412 SDValue N1 = N->getOperand(1);
37413 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37416 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
37417 // both SSE and AVX2 since there is no sign-extended shift right
37418 // operation on a vector with 64-bit elements.
37419 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
37420 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
37421 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
37422 N0.getOpcode() == ISD::SIGN_EXTEND)) {
37423 SDValue N00 = N0.getOperand(0);
37425 // EXTLOAD has a better solution on AVX2,
37426 // it may be replaced with X86ISD::VSEXT node.
37427 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
37428 if (!ISD::isNormalLoad(N00.getNode()))
37431 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
37432 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
37434 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
37440 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
37441 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
37442 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
37443 /// opportunities to combine math ops, use an LEA, or use a complex addressing
37444 /// mode. This can eliminate extend, add, and shift instructions.
37445 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
37446 const X86Subtarget &Subtarget) {
37447 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
37448 Ext->getOpcode() != ISD::ZERO_EXTEND)
37451 // TODO: This should be valid for other integer types.
37452 EVT VT = Ext->getValueType(0);
37453 if (VT != MVT::i64)
37456 SDValue Add = Ext->getOperand(0);
37457 if (Add.getOpcode() != ISD::ADD)
37460 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
37461 bool NSW = Add->getFlags().hasNoSignedWrap();
37462 bool NUW = Add->getFlags().hasNoUnsignedWrap();
37464 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
37466 if ((Sext && !NSW) || (!Sext && !NUW))
37469 // Having a constant operand to the 'add' ensures that we are not increasing
37470 // the instruction count because the constant is extended for free below.
37471 // A constant operand can also become the displacement field of an LEA.
37472 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
37476 // Don't make the 'add' bigger if there's no hope of combining it with some
37477 // other 'add' or 'shl' instruction.
37478 // TODO: It may be profitable to generate simpler LEA instructions in place
37479 // of single 'add' instructions, but the cost model for selecting an LEA
37480 // currently has a high threshold.
37481 bool HasLEAPotential = false;
37482 for (auto *User : Ext->uses()) {
37483 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
37484 HasLEAPotential = true;
37488 if (!HasLEAPotential)
37491 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
37492 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
37493 SDValue AddOp0 = Add.getOperand(0);
37494 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
37495 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
37497 // The wider add is guaranteed to not wrap because both operands are
37500 Flags.setNoSignedWrap(NSW);
37501 Flags.setNoUnsignedWrap(NUW);
37502 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
37505 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
37506 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
37507 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
37508 /// extends from AH (which we otherwise need to do contortions to access).
37509 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
37510 SDValue N0 = N->getOperand(0);
37511 auto OpcodeN = N->getOpcode();
37512 auto OpcodeN0 = N0.getOpcode();
37513 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
37514 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
37517 EVT VT = N->getValueType(0);
37518 EVT InVT = N0.getValueType();
37519 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
37520 !(VT == MVT::i32 || VT == MVT::i64))
37523 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
37524 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
37525 : X86ISD::UDIVREM8_ZEXT_HREG;
37526 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
37528 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
37529 // If this was a 64-bit extend, complete it.
37530 if (VT == MVT::i64)
37531 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
37532 return R.getValue(1);
37535 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
37536 // operands and the result of CMOV is not used anywhere else - promote CMOV
37537 // itself instead of promoting its result. This could be beneficial, because:
37538 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
37539 // (or more) pseudo-CMOVs only when they go one-after-another and
37540 // getting rid of result extension code after CMOV will help that.
37541 // 2) Promotion of constant CMOV arguments is free, hence the
37542 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
37543 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
37544 // promotion is also good in terms of code-size.
37545 // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
37547 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
37548 SDValue CMovN = Extend->getOperand(0);
37549 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
37552 EVT TargetVT = Extend->getValueType(0);
37553 unsigned ExtendOpcode = Extend->getOpcode();
37556 EVT VT = CMovN.getValueType();
37557 SDValue CMovOp0 = CMovN.getOperand(0);
37558 SDValue CMovOp1 = CMovN.getOperand(1);
37560 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37561 !isa<ConstantSDNode>(CMovOp1.getNode()))
37564 // Only extend to i32 or i64.
37565 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
37568 // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32
37570 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
37573 // If this a zero extend to i64, we should only extend to i32 and use a free
37574 // zero extend to finish.
37575 EVT ExtendVT = TargetVT;
37576 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
37577 ExtendVT = MVT::i32;
37579 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
37580 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
37582 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
37583 CMovN.getOperand(2), CMovN.getOperand(3));
37585 // Finish extending if needed.
37586 if (ExtendVT != TargetVT)
37587 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
37592 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
37593 // This is more or less the reverse of combineBitcastvxi1.
37595 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
37596 TargetLowering::DAGCombinerInfo &DCI,
37597 const X86Subtarget &Subtarget) {
37598 unsigned Opcode = N->getOpcode();
37599 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
37600 Opcode != ISD::ANY_EXTEND)
37602 if (!DCI.isBeforeLegalizeOps())
37604 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
37607 SDValue N0 = N->getOperand(0);
37608 EVT VT = N->getValueType(0);
37609 EVT SVT = VT.getScalarType();
37610 EVT InSVT = N0.getValueType().getScalarType();
37611 unsigned EltSizeInBits = SVT.getSizeInBits();
37613 // Input type must be extending a bool vector (bit-casted from a scalar
37614 // integer) to legal integer types.
37615 if (!VT.isVector())
37617 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
37619 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
37622 SDValue N00 = N0.getOperand(0);
37623 EVT SclVT = N0.getOperand(0).getValueType();
37624 if (!SclVT.isScalarInteger())
37629 SmallVector<int, 32> ShuffleMask;
37630 unsigned NumElts = VT.getVectorNumElements();
37631 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
37633 // Broadcast the scalar integer to the vector elements.
37634 if (NumElts > EltSizeInBits) {
37635 // If the scalar integer is greater than the vector element size, then we
37636 // must split it down into sub-sections for broadcasting. For example:
37637 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
37638 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
37639 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
37640 unsigned Scale = NumElts / EltSizeInBits;
37642 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
37643 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
37644 Vec = DAG.getBitcast(VT, Vec);
37646 for (unsigned i = 0; i != Scale; ++i)
37647 ShuffleMask.append(EltSizeInBits, i);
37649 // For smaller scalar integers, we can simply any-extend it to the vector
37650 // element size (we don't care about the upper bits) and broadcast it to all
37652 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
37653 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37654 ShuffleMask.append(NumElts, 0);
37656 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
37658 // Now, mask the relevant bit in each element.
37659 SmallVector<SDValue, 32> Bits;
37660 for (unsigned i = 0; i != NumElts; ++i) {
37661 int BitIdx = (i % EltSizeInBits);
37662 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
37663 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
37665 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
37666 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
37668 // Compare against the bitmask and extend the result.
37669 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
37670 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
37671 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
37673 // For SEXT, this is now done, otherwise shift the result down for
37675 if (Opcode == ISD::SIGN_EXTEND)
37677 return DAG.getNode(ISD::SRL, DL, VT, Vec,
37678 DAG.getConstant(EltSizeInBits - 1, DL, VT));
37681 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
37682 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
37683 /// with UNDEFs) of the input to vectors of the same size as the target type
37684 /// which then extends the lowest elements.
37685 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
37686 TargetLowering::DAGCombinerInfo &DCI,
37687 const X86Subtarget &Subtarget) {
37688 unsigned Opcode = N->getOpcode();
37689 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
37691 if (!DCI.isBeforeLegalizeOps())
37693 if (!Subtarget.hasSSE2())
37696 SDValue N0 = N->getOperand(0);
37697 EVT VT = N->getValueType(0);
37698 EVT SVT = VT.getScalarType();
37699 EVT InVT = N0.getValueType();
37700 EVT InSVT = InVT.getScalarType();
37702 // Input type must be a vector and we must be extending legal integer types.
37703 if (!VT.isVector())
37705 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
37707 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
37710 // On AVX2+ targets, if the input/output types are both legal then we will be
37711 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
37712 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
37713 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
37718 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
37719 EVT InVT = N.getValueType();
37720 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
37721 Size / InVT.getScalarSizeInBits());
37722 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
37723 DAG.getUNDEF(InVT));
37725 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
37728 // If target-size is less than 128-bits, extend to a type that would extend
37729 // to 128 bits, extend that and extract the original target vector.
37730 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
37731 unsigned Scale = 128 / VT.getSizeInBits();
37733 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
37734 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
37735 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
37736 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
37737 DAG.getIntPtrConstant(0, DL));
37740 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
37741 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
37742 // Also use this if we don't have SSE41 to allow the legalizer do its job.
37743 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
37744 (VT.is256BitVector() && Subtarget.hasInt256()) ||
37745 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
37746 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
37747 return Opcode == ISD::SIGN_EXTEND
37748 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
37749 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
37752 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
37753 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
37754 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
37755 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
37756 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
37758 SmallVector<SDValue, 8> Opnds;
37759 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
37760 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
37761 DAG.getIntPtrConstant(Offset, DL));
37762 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
37763 SrcVec = Opcode == ISD::SIGN_EXTEND
37764 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
37765 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
37766 Opnds.push_back(SrcVec);
37768 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
37771 // On pre-AVX2 targets, split into 128-bit nodes of
37772 // ISD::*_EXTEND_VECTOR_INREG.
37773 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
37774 return SplitAndExtendInReg(128);
37776 // On pre-AVX512 targets, split into 256-bit nodes of
37777 // ISD::*_EXTEND_VECTOR_INREG.
37778 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
37779 return SplitAndExtendInReg(256);
37784 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
37786 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
37787 const X86Subtarget &Subtarget) {
37788 SDValue N0 = N->getOperand(0);
37789 EVT VT = N->getValueType(0);
37792 // Only do this combine with AVX512 for vector extends.
37793 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
37796 // Only combine legal element types.
37797 EVT SVT = VT.getVectorElementType();
37798 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
37799 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
37802 // We can only do this if the vector size in 256 bits or less.
37803 unsigned Size = VT.getSizeInBits();
37807 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
37808 // that's the only integer compares with we have.
37809 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
37810 if (ISD::isUnsignedIntSetCC(CC))
37813 // Only do this combine if the extension will be fully consumed by the setcc.
37814 EVT N00VT = N0.getOperand(0).getValueType();
37815 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
37816 if (Size != MatchingVecType.getSizeInBits())
37819 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
37821 if (N->getOpcode() == ISD::ZERO_EXTEND)
37822 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
37827 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
37828 TargetLowering::DAGCombinerInfo &DCI,
37829 const X86Subtarget &Subtarget) {
37830 SDValue N0 = N->getOperand(0);
37831 EVT VT = N->getValueType(0);
37832 EVT InVT = N0.getValueType();
37835 if (SDValue DivRem8 = getDivRem8(N, DAG))
37838 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37841 if (!DCI.isBeforeLegalizeOps())
37844 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37847 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
37848 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
37849 // Invert and sign-extend a boolean is the same as zero-extend and subtract
37850 // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
37851 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
37852 // sext (xor Bool, -1) --> sub (zext Bool), 1
37853 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
37854 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
37857 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37860 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37864 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37867 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37873 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
37876 default: llvm_unreachable("Unexpected opcode");
37877 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
37878 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
37879 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
37880 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
37881 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
37882 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
37883 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
37884 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
37890 default: llvm_unreachable("Unexpected opcode");
37891 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
37892 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
37893 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
37894 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
37895 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
37896 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
37897 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
37898 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
37905 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
37906 const X86Subtarget &Subtarget) {
37908 EVT VT = N->getValueType(0);
37910 // Let legalize expand this if it isn't a legal type yet.
37911 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
37914 EVT ScalarVT = VT.getScalarType();
37915 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
37918 SDValue A = N->getOperand(0);
37919 SDValue B = N->getOperand(1);
37920 SDValue C = N->getOperand(2);
37922 auto invertIfNegative = [&DAG](SDValue &V) {
37923 if (SDValue NegVal = isFNEG(V.getNode())) {
37924 V = DAG.getBitcast(V.getValueType(), NegVal);
37927 // Look through extract_vector_elts. If it comes from an FNEG, create a
37928 // new extract from the FNEG input.
37929 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37930 isNullConstant(V.getOperand(1))) {
37931 if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
37932 NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
37933 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
37934 NegVal, V.getOperand(1));
37942 // Do not convert the passthru input of scalar intrinsics.
37943 // FIXME: We could allow negations of the lower element only.
37944 bool NegA = invertIfNegative(A);
37945 bool NegB = invertIfNegative(B);
37946 bool NegC = invertIfNegative(C);
37948 if (!NegA && !NegB && !NegC)
37951 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
37953 if (N->getNumOperands() == 4)
37954 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
37955 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37958 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
// (and the symmetric FMSUBADD -> FMADDSUB fold, including _RND variants).
// Folding the FNEG of the addend into the opcode saves an explicit negate.
37959 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
37960 const X86Subtarget &Subtarget) {
37962 EVT VT = N->getValueType(0);
// Bail out unless operand 2 is an FNEG; NegVal is the un-negated input.
37964 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
37968 unsigned NewOpcode;
37969 switch (N->getOpcode()) {
37970 default: llvm_unreachable("Unexpected opcode!");
37971 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
37972 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
37973 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
37974 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
// _RND variants have a 4th (rounding-control) operand to preserve.
37977 if (N->getNumOperands() == 4)
37978 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37979 NegVal, N->getOperand(3));
37980 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
// DAG combine for ISD::ZERO_EXTEND (and related) on X86: eliminates zexts of
// SETCC_CARRY patterns, then delegates to a series of shared ext combines.
37984 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
37985 TargetLowering::DAGCombinerInfo &DCI,
37986 const X86Subtarget &Subtarget) {
37987 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
37988 // (and (i32 x86isd::setcc_carry), 1)
37989 // This eliminates the zext. This transformation is necessary because
37990 // ISD::SETCC is always legalized to i8.
37992 SDValue N0 = N->getOperand(0);
37993 EVT VT = N->getValueType(0);
// Pattern 1: zext (and (setcc_carry), 1) -- rebuild SETCC_CARRY directly in
// the wider result type and mask with 1 there.
37995 if (N0.getOpcode() == ISD::AND &&
37997 N0.getOperand(0).hasOneUse()) {
37998 SDValue N00 = N0.getOperand(0);
37999 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
38000 if (!isOneConstant(N0.getOperand(1)))
38002 return DAG.getNode(ISD::AND, dl, VT,
38003 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
38004 N00.getOperand(0), N00.getOperand(1)),
38005 DAG.getConstant(1, dl, VT));
// Pattern 2: zext (trunc (setcc_carry)) -- same rewrite; SETCC_CARRY is
// all-zeros or all-ones, so masking with 1 preserves the truncated value.
38009 if (N0.getOpcode() == ISD::TRUNCATE &&
38011 N0.getOperand(0).hasOneUse()) {
38012 SDValue N00 = N0.getOperand(0);
38013 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
38014 return DAG.getNode(ISD::AND, dl, VT,
38015 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
38016 N00.getOperand(0), N00.getOperand(1)),
38017 DAG.getConstant(1, dl, VT));
// Delegate to shared extension combines; each returns a replacement node or
// a null SDValue. (This excerpt elides the intervening `return` lines.)
38021 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
38024 if (DCI.isBeforeLegalizeOps())
38025 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
38028 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
38031 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
38035 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
38038 if (SDValue DivRem8 = getDivRem8(N, DAG))
38041 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
38044 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
38050 /// Try to map a 128-bit or larger integer comparison to vector instructions
38051 /// before type legalization splits it up into chunks.
// For i128/i256 (in)equality compares, emit PCMPEQB + PMOVMSKB and compare
// the resulting bitmask against all-ones, avoiding chunked scalar compares.
38052 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
38053 const X86Subtarget &Subtarget) {
38054 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
38055 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
38057 // We're looking for an oversized integer equality comparison.
38058 SDValue X = SetCC->getOperand(0);
38059 SDValue Y = SetCC->getOperand(1);
38060 EVT OpVT = X.getValueType();
38061 unsigned OpSize = OpVT.getSizeInBits();
38062 if (!OpVT.isScalarInteger() || OpSize < 128)
38065 // Ignore a comparison with zero because that gets special treatment in
38066 // EmitTest(). But make an exception for the special case of a pair of
38067 // logically-combined vector-sized operands compared to zero. This pattern may
38068 // be generated by the memcmp expansion pass with oversized integer compares
// Matches: setcc (or (xor A, B), (xor C, D)), 0 -- a fused double compare.
38070 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
38071 X.getOperand(0).getOpcode() == ISD::XOR &&
38072 X.getOperand(1).getOpcode() == ISD::XOR;
38073 if (isNullConstant(Y) && !IsOrXorXorCCZero)
38076 // Bail out if we know that this is not really just an oversized integer.
38077 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
38078 peekThroughBitcasts(Y).getValueType() == MVT::f128)
38081 // TODO: Use PXOR + PTEST for SSE4.1 or later?
38082 // TODO: Add support for AVX-512.
38083 EVT VT = SetCC->getValueType(0);
// 128-bit needs SSE2 (pcmpeqb/pmovmskb); 256-bit needs AVX2 (vpcmpeqb ymm).
38085 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
38086 (OpSize == 256 && Subtarget.hasAVX2())) {
38087 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
38089 if (IsOrXorXorCCZero) {
38090 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
38091 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
38092 // Use 2 vector equality compares and 'and' the results before doing a
38094 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
38095 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
38096 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
38097 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
38098 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
38099 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
38100 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
// Plain case: bitcast both sides to a byte vector and compare directly.
38102 SDValue VecX = DAG.getBitcast(VecVT, X);
38103 SDValue VecY = DAG.getBitcast(VecVT, Y);
38104 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
38106 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
38107 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
38108 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
38109 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
38110 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
38111 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
38112 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
38114 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
38120 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
38121 const X86Subtarget &Subtarget) {
38122 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
38123 SDValue LHS = N->getOperand(0);
38124 SDValue RHS = N->getOperand(1);
38125 EVT VT = N->getValueType(0);
38126 EVT OpVT = LHS.getValueType();
38129 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
38130 // 0-x == y --> x+y == 0
38131 // 0-x != y --> x+y != 0
38132 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
38134 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
38135 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
38137 // x == 0-y --> x+y == 0
38138 // x != 0-y --> x+y != 0
38139 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
38141 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
38142 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
38145 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
38149 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
38150 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
38151 // Put build_vectors on the right.
38152 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
38153 std::swap(LHS, RHS);
38154 CC = ISD::getSetCCSwappedOperands(CC);
38158 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
38159 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
38160 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
38162 if (IsSEXT0 && IsVZero1) {
38163 assert(VT == LHS.getOperand(0).getValueType() &&
38164 "Uexpected operand type");
38165 if (CC == ISD::SETGT)
38166 return DAG.getConstant(0, DL, VT);
38167 if (CC == ISD::SETLE)
38168 return DAG.getConstant(1, DL, VT);
38169 if (CC == ISD::SETEQ || CC == ISD::SETGE)
38170 return DAG.getNOT(DL, LHS.getOperand(0), VT);
38172 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
38173 "Unexpected condition code!");
38174 return LHS.getOperand(0);
38178 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
38179 // pre-promote its result type since vXi1 vectors don't get promoted
38180 // during type legalization.
38181 // NOTE: The element count check is to ignore operand types that need to
38182 // go through type promotion to a 128-bit vector.
38183 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
38184 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
38185 (OpVT.getVectorElementType() == MVT::i8 ||
38186 OpVT.getVectorElementType() == MVT::i16)) {
38187 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
38189 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
38192 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
38193 // to avoid scalarization via legalization because v4i32 is not a legal type.
38194 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
38195 LHS.getValueType() == MVT::v4f32)
38196 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
// DAG combine for X86ISD::MOVMSK: constant-fold when the source vector is a
// build_vector of constants, and shrink demanded bits on the source since
// MOVMSK only reads each element's sign bit.
38201 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
38202 TargetLowering::DAGCombinerInfo &DCI) {
38203 SDValue Src = N->getOperand(0);
38204 MVT SrcVT = Src.getSimpleValueType();
38206 // Perform constant folding.
38207 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
38208 assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
// Set result bit Idx when element Idx is negative (sign bit set); undef
// elements contribute 0.
38210 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
38211 SDValue In = Src.getOperand(Idx);
38212 if (!In.isUndef() &&
38213 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
38216 return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
38219 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38220 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38221 !DCI.isBeforeLegalizeOps());
38223 // MOVMSK only uses the MSB from each vector element.
38225 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
38226 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
38227 DCI.AddToWorklist(Src.getNode());
38228 DCI.CommitTargetLoweringOpt(TLO);
// Returning the node itself signals "operands changed in place; revisit".
38229 return SDValue(N, 0);
// DAG combine for X86 gather/scatter nodes: canonicalize the index operand
// (operand 4) to i32/i64 element width, strip removable sign/zero extends on
// it, and (pre-AVX512) shrink the mask (operand 2) to its sign bits only.
38235 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
38236 TargetLowering::DAGCombinerInfo &DCI,
38237 const X86Subtarget &Subtarget) {
38240 if (DCI.isBeforeLegalizeOps()) {
38241 SDValue Index = N->getOperand(4);
38242 // Remove any sign extends from 32 or smaller to larger than 32.
38243 // Only do this before LegalizeOps in case we need the sign extend for
38245 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
38246 if (Index.getScalarValueSizeInBits() > 32 &&
38247 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
38248 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38249 NewOps[4] = Index.getOperand(0);
38250 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38252 // The original sign extend has less users, add back to worklist in
38253 // case it needs to be removed
38254 DCI.AddToWorklist(Index.getNode());
38255 DCI.AddToWorklist(N);
38257 return SDValue(Res, 0);
38261 // Make sure the index is either i32 or i64
38262 unsigned ScalarSize = Index.getScalarValueSizeInBits();
38263 if (ScalarSize != 32 && ScalarSize != 64) {
// Widen small indices to i32, narrow oversized ones to i64.
38264 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
38265 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
38266 Index.getValueType().getVectorNumElements());
38267 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
38268 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38270 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38272 DCI.AddToWorklist(N);
38273 return SDValue(Res, 0);
38276 // Try to remove zero extends from 32->64 if we know the sign bit of
38277 // the input is zero.
38278 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
38279 Index.getScalarValueSizeInBits() == 64 &&
38280 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
// With a clear sign bit, sext and zext agree, so the hardware's implicit
// sign-extension of a 32-bit index is safe.
38281 if (DAG.SignBitIsZero(Index.getOperand(0))) {
38282 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38283 NewOps[4] = Index.getOperand(0);
38284 SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
38286 // The original sign extend has less users, add back to worklist in
38287 // case it needs to be removed
38288 DCI.AddToWorklist(Index.getNode());
38289 DCI.AddToWorklist(N);
38291 return SDValue(Res, 0);
38296 // With AVX2 we only demand the upper bit of the mask.
38297 if (!Subtarget.hasAVX512()) {
38298 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38299 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38300 !DCI.isBeforeLegalizeOps());
38301 SDValue Mask = N->getOperand(2);
38303 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
38304 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
38305 DCI.AddToWorklist(Mask.getNode());
38306 DCI.CommitTargetLoweringOpt(TLO);
38307 return SDValue(N, 0);
38314 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
// Attempts to simplify the EFLAGS producer and/or the condition code; on
// success rebuilds the SETCC with the simplified flags.
38315 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
38316 const X86Subtarget &Subtarget) {
38318 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
38319 SDValue EFLAGS = N->getOperand(1);
38321 // Try to simplify the EFLAGS and condition code operands.
// Note: combineSetCCEFLAGS may update CC (taken by reference).
38322 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
38323 return getSETCC(CC, Flags, DL, DAG);
38328 /// Optimize branch condition evaluation.
// X86ISD::BRCOND operands: 0 = chain, 1 = target, 2 = cond code, 3 = EFLAGS.
38329 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
38330 const X86Subtarget &Subtarget) {
38332 SDValue EFLAGS = N->getOperand(3);
38333 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
38335 // Try to simplify the EFLAGS and condition code operands.
38336 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
38337 // RAUW them under us.
38338 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
38339 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
38340 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
38341 N->getOperand(1), Cond, Flags);
// Hoist a unary op through AND(VECTOR_CMP, constant): since each compare
// lane is all-zeros or all-ones, applying the op to the constant mask ahead
// of time is equivalent and removes the runtime op.
38347 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
38348 SelectionDAG &DAG) {
38349 // Take advantage of vector comparisons producing 0 or -1 in each lane to
38350 // optimize away operation when it's from a constant.
38352 // The general transformation is:
38353 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
38354 // AND(VECTOR_CMP(x,y), constant2)
38355 // constant2 = UNARYOP(constant)
38357 // Early exit if this isn't a vector operation, the operand of the
38358 // unary operation isn't a bitwise AND, or if the sizes of the operations
38359 // aren't the same.
38360 EVT VT = N->getValueType(0);
38361 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
38362 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
38363 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
38366 // Now check that the other operand of the AND is a constant. We could
38367 // make the transformation for non-constant splats as well, but it's unclear
38368 // that would be a benefit as it would not eliminate any operations, just
38369 // perform one more step in scalar code before moving to the vector unit.
38370 if (BuildVectorSDNode *BV =
38371 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
38372 // Bail out if the vector isn't a constant.
38373 if (!BV->isConstant())
38376 // Everything checks out. Build up the new and improved node.
38378 EVT IntVT = BV->getValueType(0);
38379 // Create a new constant of the appropriate type for the transformed
// SourceConst = UNARYOP(constant); folded at build time by the DAG.
38381 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
38382 // The AND node needs bitcasts to/from an integer vector type around it.
38383 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
38384 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
38385 N->getOperand(0)->getOperand(0), MaskConst);
38386 SDValue Res = DAG.getBitcast(VT, NewAnd);
// DAG combine for ISD::UINT_TO_FP: widen narrow vector sources to i32 so the
// conversion can be done as the cheaper (and always-legal) SINT_TO_FP, and
// demote to SINT_TO_FP outright when the sign bit is known zero.
38393 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
38394 const X86Subtarget &Subtarget) {
38395 SDValue Op0 = N->getOperand(0);
38396 EVT VT = N->getValueType(0);
38397 EVT InVT = Op0.getValueType();
38399 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38400 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
38401 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
38402 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38404 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38405 InVT.getVectorNumElements())
38406 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
38408 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
// Zero-extended values fit in 31 bits, so signed conversion is exact.
38409 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38412 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
38413 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
38414 // the optimization here.
38415 if (DAG.SignBitIsZero(Op0))
38416 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
// DAG combine for ISD::SINT_TO_FP: fold away conversions of compare-masked
// constants, widen narrow vector sources to i32, narrow wide sources whose
// upper bits are all sign bits, and form x87 FILD for i64 loads on 32-bit
// targets without DQI.
38421 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
38422 const X86Subtarget &Subtarget) {
38423 // First try to optimize away the conversion entirely when it's
38424 // conditionally from a constant. Vectors only.
38425 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
38428 // Now move on to more general possibilities.
38429 SDValue Op0 = N->getOperand(0);
38430 EVT VT = N->getValueType(0);
38431 EVT InVT = Op0.getValueType();
38433 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38434 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
38435 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
38436 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38438 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38439 InVT.getVectorNumElements());
38440 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
38441 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38444 // Without AVX512DQ we only support i64 to float scalar conversion. For both
38445 // vectors and scalars, see if we know that the upper bits are all the sign
38446 // bit, in which case we can truncate the input to i32 and convert from that.
38447 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
38448 unsigned BitWidth = InVT.getScalarSizeInBits();
38449 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// >= BitWidth-31 sign bits means the value fits in a signed i32.
38450 if (NumSignBits >= (BitWidth - 31)) {
38451 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
38452 if (InVT.isVector())
38453 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
38454 InVT.getVectorNumElements());
38456 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
38457 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
38461 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
38462 // a 32-bit target where SSE doesn't support i64->FP operations.
38463 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
38464 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
38465 EVT LdVT = Ld->getValueType(0);
38467 // This transformation is not supported if the result type is f16 or f128.
38468 if (VT == MVT::f16 || VT == MVT::f128)
38471 // If we have AVX512DQ we can use packed conversion instructions unless
38473 if (Subtarget.hasDQI() && VT != MVT::f80)
// FILD folds the load, so require a non-volatile, non-extending,
// single-use i64 load on a 32-bit target.
38476 if (!Ld->isVolatile() && !VT.isVector() &&
38477 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
38478 !Subtarget.is64Bit() && LdVT == MVT::i64) {
38479 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
38480 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
// Re-chain users of the load onto the FILD's output chain.
38481 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
// DAG combine for X86ISD::SBB: if the incoming carry (operand 2) comes
// through an ADD that combineCarryThroughADD can see through, rebuild the
// SBB using the underlying flags producer directly.
38488 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
38489 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38490 MVT VT = N->getSimpleValueType(0);
// SBB produces (result, EFLAGS).
38491 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38492 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
38493 N->getOperand(0), N->getOperand(1),
38500 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
38501 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
38502 TargetLowering::DAGCombinerInfo &DCI) {
38503 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
38504 // the result is either zero or one (depending on the input carry bit).
38505 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
38506 if (X86::isZeroNode(N->getOperand(0)) &&
38507 X86::isZeroNode(N->getOperand(1)) &&
38508 // We don't have a good way to replace an EFLAGS use, so only do this when
// ...the node's flags result (value 1) is unused.
38510 SDValue(N, 1).use_empty()) {
38512 EVT VT = N->getValueType(0);
// The replacement never carries out, so the flags result becomes constant 0.
38513 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
38514 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
38515 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38516 DAG.getConstant(X86::COND_B, DL,
38519 DAG.getConstant(1, DL, VT));
38520 return DCI.CombineTo(N, Res1, CarryOut);
// Same carry-through-ADD simplification as combineSBB above.
38523 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38524 MVT VT = N->getSimpleValueType(0);
38525 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38526 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
38527 N->getOperand(0), N->getOperand(1),
38534 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
38535 /// which is more useful than 0/1 in some cases.
// N supplies the result type; EFLAGS is the flags producer feeding the SBB.
38536 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
38538 // "Condition code B" is also known as "the carry flag" (CF).
38539 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
38540 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
38541 MVT VT = N->getSimpleValueType(0);
// For i8 results, mask to a single bit; elided branch presumably handles
// other widths -- TODO confirm against the full source.
38543 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
38545 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
38546 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
38549 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
38550 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
38551 /// with CMP+{ADC, SBB}.
38552 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
38553 bool IsSub = N->getOpcode() == ISD::SUB;
38554 SDValue X = N->getOperand(0);
38555 SDValue Y = N->getOperand(1);
38557 // If this is an add, canonicalize a zext operand to the RHS.
38558 // TODO: Incomplete? What if both sides are zexts?
38559 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
38560 Y.getOpcode() != ISD::ZERO_EXTEND)
38563 // Look through a one-use zext.
38564 bool PeekedThroughZext = false;
38565 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
38566 Y = Y.getOperand(0);
38567 PeekedThroughZext = true;
38570 // If this is an add, canonicalize a setcc operand to the RHS.
38571 // TODO: Incomplete? What if both sides are setcc?
38572 // TODO: Should we allow peeking through a zext of the other operand?
38573 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
38574 Y.getOpcode() != X86ISD::SETCC)
// Only handle a single-use SETCC on the RHS from here on.
38577 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
38581 EVT VT = N->getValueType(0);
38582 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
38584 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38585 // the general case below.
38586 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
38588 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
38589 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
38590 // This is a complicated way to get -1 or 0 from the carry flag:
38591 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38592 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38593 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38594 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38598 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
38599 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
38600 SDValue EFLAGS = Y->getOperand(1);
38601 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38602 EFLAGS.getValueType().isInteger() &&
38603 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38604 // Swap the operands of a SUB, and we have the same pattern as above.
38605 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
38606 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
38607 SDValue NewSub = DAG.getNode(
38608 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
38609 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38610 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38611 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38612 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38618 if (CC == X86::COND_B) {
38619 // X + SETB Z --> X + (mask SBB Z, Z)
38620 // X - SETB Z --> X - (mask SBB Z, Z)
38621 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
38622 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
38623 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38624 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38625 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
38628 if (CC == X86::COND_A) {
38629 SDValue EFLAGS = Y->getOperand(1);
38630 // Try to convert COND_A into COND_B in an attempt to facilitate
38631 // materializing "setb reg".
38633 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
38634 // cannot take an immediate as its first operand.
38636 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38637 EFLAGS.getValueType().isInteger() &&
38638 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38639 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
38640 EFLAGS.getNode()->getVTList(),
38641 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38642 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38643 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
38644 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38645 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38646 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
// From here on, only eq/ne of Z against zero is handled.
38650 if (CC != X86::COND_E && CC != X86::COND_NE)
38653 SDValue Cmp = Y.getOperand(1);
38654 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
38655 !X86::isZeroNode(Cmp.getOperand(1)) ||
38656 !Cmp.getOperand(0).getValueType().isInteger())
38659 SDValue Z = Cmp.getOperand(0);
38660 EVT ZVT = Z.getValueType();
38662 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38663 // the general case below.
38665 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
38667 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
38668 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
38669 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
38670 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
38671 SDValue Zero = DAG.getConstant(0, DL, ZVT);
38672 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
38673 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
38674 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38675 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38676 SDValue(Neg.getNode(), 1));
38679 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
38680 // with fake operands:
38681 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
38682 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
38683 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
38684 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
38685 SDValue One = DAG.getConstant(1, DL, ZVT);
38686 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38687 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38688 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
38692 // (cmp Z, 1) sets the carry flag if Z is 0.
38693 SDValue One = DAG.getConstant(1, DL, ZVT);
38694 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38696 // Add the flags type for ADC/SBB nodes.
38697 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38699 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
38700 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
38701 if (CC == X86::COND_NE)
38702 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
38703 DAG.getConstant(-1ULL, DL, VT), Cmp1);
38705 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
38706 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
38707 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
38708 DAG.getConstant(0, DL, VT), Cmp1);
// Match a loop-reduction add of a widened i16 multiply and rewrite it to use
// PMADDWD (VPMADDWD), which multiplies i16 pairs and accumulates into i32.
38711 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
38712 const X86Subtarget &Subtarget) {
38713 if (!Subtarget.hasSSE2())
38716 SDValue MulOp = N->getOperand(0);
38717 SDValue Phi = N->getOperand(1);
// The MUL may be on either side of the reduction add.
38719 if (MulOp.getOpcode() != ISD::MUL)
38720 std::swap(MulOp, Phi);
38721 if (MulOp.getOpcode() != ISD::MUL)
// MULU16 (unsigned i16) cannot use the signed PMADDWD.
38725 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
38728 EVT VT = N->getValueType(0);
38730 // If the vector size is less than 128, or greater than the supported RegSize,
38731 // do not use PMADD.
38732 if (VT.getVectorNumElements() < 8)
38736 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38737 VT.getVectorNumElements());
38738 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38739 VT.getVectorNumElements() / 2);
38741 // Shrink the operands of mul.
38742 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
38743 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
38745 // Madd vector size is half of the original vector size
38746 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38747 ArrayRef<SDValue> Ops) {
38748 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
38749 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
// SplitOpsAndApply legalizes oversized vectors by splitting per register.
38751 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
38753 // Fill the rest of the output with 0
38754 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
38755 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
38756 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
// Match a loop-reduction add over an absolute-difference-of-zexts pattern
// (vselect-based abs-diff) and rewrite it to PSADBW plus an add.
38759 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
38760 const X86Subtarget &Subtarget) {
38761 if (!Subtarget.hasSSE2())
38765 EVT VT = N->getValueType(0);
38766 SDValue Op0 = N->getOperand(0);
38767 SDValue Op1 = N->getOperand(1);
38769 // TODO: There's nothing special about i32, any integer type above i16 should
38770 // work just as well.
38771 if (!VT.isVector() || !VT.isSimple() ||
38772 !(VT.getVectorElementType() == MVT::i32))
// Widest register available bounds how large a reduction we accept.
38775 unsigned RegSize = 128;
38776 if (Subtarget.useBWIRegs())
38778 else if (Subtarget.hasAVX())
38781 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
38782 // TODO: We should be able to handle larger vectors by splitting them before
38783 // feeding them into several SADs, and then reducing over those.
38784 if (VT.getSizeInBits() / 4 > RegSize)
38787 // We know N is a reduction add, which means one of its operands is a phi.
38788 // To match SAD, we need the other operand to be a vector select.
38789 SDValue SelectOp, Phi;
38790 if (Op0.getOpcode() == ISD::VSELECT) {
38793 } else if (Op1.getOpcode() == ISD::VSELECT) {
38799 // Check whether we have an abs-diff pattern feeding into the select.
// detectZextAbsDiff rewrites Op0/Op1 to the zext'd inputs on success.
38800 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
38803 // SAD pattern detected. Now build a SAD instruction and an addition for
38804 // reduction. Note that the number of elements of the result of SAD is less
38805 // than the number of elements of its input. Therefore, we could only update
38806 // part of elements in the reduction vector.
38807 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
38809 // The output of PSADBW is a vector of i64.
38810 // We need to turn the vector of i64 into a vector of i32.
38811 // If the reduction vector is at least as wide as the psadbw result, just
38812 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
38814 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
38815 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
38816 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
38818 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
38820 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
38821 // Fill the upper elements with zero to match the add width.
38822 SDValue Zero = DAG.getConstant(0, DL, VT);
38823 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
38824 DAG.getIntPtrConstant(0, DL));
38827 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
38830 /// Convert vector increment or decrement to sub/add with an all-ones constant:
38831 /// add X, <1, 1...> --> sub X, <-1, -1...>
38832 /// sub X, <1, 1...> --> add X, <-1, -1...>
38833 /// The all-ones vector constant can be materialized using a pcmpeq instruction
38834 /// that is commonly recognized as an idiom (has no register dependency), so
38835 /// that's better/smaller than loading a splat 1 constant.
38836 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
38837 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
38838 "Unexpected opcode for increment/decrement transform");
38840 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
38841 // out and wait for legalization if we have an unsupported vector length.
38842 EVT VT = N->getValueType(0);
38843 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
38846 SDNode *N1 = N->getOperand(1).getNode();
38848 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
38849 !SplatVal.isOneValue())
38852 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
38853 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
38854 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
38857 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
38858 const SDLoc &DL, EVT VT,
38859 const X86Subtarget &Subtarget) {
38860 // Example of pattern we try to detect:
38861 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
38862 //(add (build_vector (extract_elt t, 0),
38863 // (extract_elt t, 2),
38864 // (extract_elt t, 4),
38865 // (extract_elt t, 6)),
38866 // (build_vector (extract_elt t, 1),
38867 // (extract_elt t, 3),
38868 // (extract_elt t, 5),
38869 // (extract_elt t, 7)))
38871 if (!Subtarget.hasSSE2())
38874 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
38875 Op1.getOpcode() != ISD::BUILD_VECTOR)
38878 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38879 VT.getVectorNumElements() < 4 ||
38880 !isPowerOf2_32(VT.getVectorNumElements()))
38883 // Check if one of Op0,Op1 is of the form:
38884 // (build_vector (extract_elt Mul, 0),
38885 // (extract_elt Mul, 2),
38886 // (extract_elt Mul, 4),
38888 // the other is of the form:
38889 // (build_vector (extract_elt Mul, 1),
38890 // (extract_elt Mul, 3),
38891 // (extract_elt Mul, 5),
38893 // and identify Mul.
38895 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
38896 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
38897 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
38898 // TODO: Be more tolerant to undefs.
38899 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38900 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38901 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38902 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
38904 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
38905 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38906 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38907 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
38908 if (!Const0L || !Const1L || !Const0H || !Const1H)
38910 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38911 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38912 // Commutativity of mul allows factors of a product to reorder.
38914 std::swap(Idx0L, Idx1L);
38916 std::swap(Idx0H, Idx1H);
38917 // Commutativity of add allows pairs of factors to reorder.
38918 if (Idx0L > Idx0H) {
38919 std::swap(Idx0L, Idx0H);
38920 std::swap(Idx1L, Idx1H);
38922 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
38923 Idx1H != 2 * i + 3)
38926 // First time an extract_elt's source vector is visited. Must be a MUL
38927 // with 2X number of vector elements than the BUILD_VECTOR.
38928 // Both extracts must be from same MUL.
38929 Mul = Op0L->getOperand(0);
38930 if (Mul->getOpcode() != ISD::MUL ||
38931 Mul.getValueType().getVectorNumElements() != 2 * e)
38934 // Check that the extract is from the same MUL previously seen.
38935 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
38936 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
38940 // Check if the Mul source can be safely shrunk.
38942 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
38945 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38946 ArrayRef<SDValue> Ops) {
38947 // Shrink by adding truncate nodes and let DAGCombine fold with the
38949 EVT InVT = Ops[0].getValueType();
38950 assert(InVT.getScalarType() == MVT::i32 &&
38951 "Unexpected scalar element type");
38952 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38953 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38954 InVT.getVectorNumElements() / 2);
38955 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38956 InVT.getVectorNumElements());
38957 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38958 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
38959 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
38961 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
38962 { Mul.getOperand(0), Mul.getOperand(1) },
38966 // Attempt to turn this pattern into PMADDWD.
38967 // (mul (add (zext (build_vector)), (zext (build_vector))),
38968 // (add (zext (build_vector)), (zext (build_vector)))
38969 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
38970 const SDLoc &DL, EVT VT,
38971 const X86Subtarget &Subtarget) {
38972 if (!Subtarget.hasSSE2())
38975 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
38978 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38979 VT.getVectorNumElements() < 4 ||
38980 !isPowerOf2_32(VT.getVectorNumElements()))
38983 SDValue N00 = N0.getOperand(0);
38984 SDValue N01 = N0.getOperand(1);
38985 SDValue N10 = N1.getOperand(0);
38986 SDValue N11 = N1.getOperand(1);
38988 // All inputs need to be sign extends.
38989 // TODO: Support ZERO_EXTEND from known positive?
38990 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
38991 N01.getOpcode() != ISD::SIGN_EXTEND ||
38992 N10.getOpcode() != ISD::SIGN_EXTEND ||
38993 N11.getOpcode() != ISD::SIGN_EXTEND)
38996 // Peek through the extends.
38997 N00 = N00.getOperand(0);
38998 N01 = N01.getOperand(0);
38999 N10 = N10.getOperand(0);
39000 N11 = N11.getOperand(0);
39002 // Must be extending from vXi16.
39003 EVT InVT = N00.getValueType();
39004 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
39005 N10.getValueType() != InVT || N11.getValueType() != InVT)
39008 // All inputs should be build_vectors.
39009 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
39010 N01.getOpcode() != ISD::BUILD_VECTOR ||
39011 N10.getOpcode() != ISD::BUILD_VECTOR ||
39012 N11.getOpcode() != ISD::BUILD_VECTOR)
39015 // For each element, we need to ensure we have an odd element from one vector
39016 // multiplied by the odd element of another vector and the even element from
39017 // one of the same vectors being multiplied by the even element from the
39018 // other vector. So we need to make sure for each element i, this operator
39019 // is being performed:
39020 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
39022 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
39023 SDValue N00Elt = N00.getOperand(i);
39024 SDValue N01Elt = N01.getOperand(i);
39025 SDValue N10Elt = N10.getOperand(i);
39026 SDValue N11Elt = N11.getOperand(i);
39027 // TODO: Be more tolerant to undefs.
39028 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39029 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39030 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
39031 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
39033 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
39034 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
39035 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
39036 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
39037 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
39039 unsigned IdxN00 = ConstN00Elt->getZExtValue();
39040 unsigned IdxN01 = ConstN01Elt->getZExtValue();
39041 unsigned IdxN10 = ConstN10Elt->getZExtValue();
39042 unsigned IdxN11 = ConstN11Elt->getZExtValue();
39043 // Add is commutative so indices can be reordered.
39044 if (IdxN00 > IdxN10) {
39045 std::swap(IdxN00, IdxN10);
39046 std::swap(IdxN01, IdxN11);
39048 // N0 indices be the even element. N1 indices must be the next odd element.
39049 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
39050 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
39052 SDValue N00In = N00Elt.getOperand(0);
39053 SDValue N01In = N01Elt.getOperand(0);
39054 SDValue N10In = N10Elt.getOperand(0);
39055 SDValue N11In = N11Elt.getOperand(0);
39056 // First time we find an input capture it.
39061 // Mul is commutative so the input vectors can be in any order.
39062 // Canonicalize to make the compares easier.
39064 std::swap(N00In, N01In);
39066 std::swap(N10In, N11In);
39067 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
39071 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39072 ArrayRef<SDValue> Ops) {
39073 // Shrink by adding truncate nodes and let DAGCombine fold with the
39075 EVT InVT = Ops[0].getValueType();
39076 assert(InVT.getScalarType() == MVT::i16 &&
39077 "Unexpected scalar element type");
39078 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
39079 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
39080 InVT.getVectorNumElements() / 2);
39081 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
39083 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
39087 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
39088 const X86Subtarget &Subtarget) {
39089 const SDNodeFlags Flags = N->getFlags();
39090 if (Flags.hasVectorReduction()) {
39091 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
39093 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
39096 EVT VT = N->getValueType(0);
39097 SDValue Op0 = N->getOperand(0);
39098 SDValue Op1 = N->getOperand(1);
39100 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
39102 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
39105 // Try to synthesize horizontal adds from adds of shuffles.
39106 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
39107 VT == MVT::v8i32) &&
39108 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
39109 auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39110 ArrayRef<SDValue> Ops) {
39111 return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
39113 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
39117 if (SDValue V = combineIncDecVector(N, DAG))
39120 return combineAddOrSubToADCOrSBB(N, DAG);
39123 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
39124 const X86Subtarget &Subtarget) {
39125 SDValue Op0 = N->getOperand(0);
39126 SDValue Op1 = N->getOperand(1);
39127 EVT VT = N->getValueType(0);
39129 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
39130 // is only worth it with SSSE3 (PSHUFB).
39131 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
39132 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
39133 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
39134 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
39135 VT == MVT::v16i32 || VT == MVT::v8i64)))
39138 SDValue SubusLHS, SubusRHS;
39139 // Try to find umax(a,b) - b or a - umin(a,b) patterns
39140 // they may be converted to subus(a,b).
39141 // TODO: Need to add IR canonicalization for this code.
39142 if (Op0.getOpcode() == ISD::UMAX) {
39144 SDValue MaxLHS = Op0.getOperand(0);
39145 SDValue MaxRHS = Op0.getOperand(1);
39148 else if (MaxRHS == Op1)
39152 } else if (Op1.getOpcode() == ISD::UMIN) {
39154 SDValue MinLHS = Op1.getOperand(0);
39155 SDValue MinRHS = Op1.getOperand(1);
39158 else if (MinRHS == Op0)
39165 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39166 ArrayRef<SDValue> Ops) {
39167 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
39170 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
39171 // special preprocessing in some cases.
39172 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
39173 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
39174 { SubusLHS, SubusRHS }, SUBUSBuilder);
39176 // Special preprocessing case can be only applied
39177 // if the value was zero extended from 16 bit,
39178 // so we require first 16 bits to be zeros for 32 bit
39179 // values, or first 48 bits for 64 bit values.
39181 DAG.computeKnownBits(SubusLHS, Known);
39182 unsigned NumZeros = Known.countMinLeadingZeros();
39183 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
39186 EVT ExtType = SubusLHS.getValueType();
39188 if (VT == MVT::v8i32 || VT == MVT::v8i64)
39189 ShrinkedType = MVT::v8i16;
39191 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
39193 // If SubusLHS is zeroextended - truncate SubusRHS to it's
39194 // size SubusRHS = umin(0xFFF.., SubusRHS).
39195 SDValue SaturationConst =
39196 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
39197 ShrinkedType.getScalarSizeInBits()),
39198 SDLoc(SubusLHS), ExtType);
39199 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
39201 SDValue NewSubusLHS =
39202 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
39203 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
39205 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
39206 { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
39207 // Zero extend the result, it may be used somewhere as 32 bit,
39208 // if not zext and following trunc will shrink.
39209 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
39212 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
39213 const X86Subtarget &Subtarget) {
39214 SDValue Op0 = N->getOperand(0);
39215 SDValue Op1 = N->getOperand(1);
39217 // X86 can't encode an immediate LHS of a sub. See if we can push the
39218 // negation into a preceding instruction.
39219 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
39220 // If the RHS of the sub is a XOR with one use and a constant, invert the
39221 // immediate. Then add one to the LHS of the sub so we can turn
39222 // X-Y -> X+~Y+1, saving one register.
39223 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
39224 isa<ConstantSDNode>(Op1.getOperand(1))) {
39225 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
39226 EVT VT = Op0.getValueType();
39227 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
39229 DAG.getConstant(~XorC, SDLoc(Op1), VT));
39230 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
39231 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
39235 // Try to synthesize horizontal subs from subs of shuffles.
39236 EVT VT = N->getValueType(0);
39237 if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
39238 VT == MVT::v8i32) &&
39239 Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
39240 auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
39241 ArrayRef<SDValue> Ops) {
39242 return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
39244 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
39248 if (SDValue V = combineIncDecVector(N, DAG))
39251 // Try to create PSUBUS if SUB's argument is max/min
39252 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
39255 return combineAddOrSubToADCOrSBB(N, DAG);
39258 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
39259 TargetLowering::DAGCombinerInfo &DCI,
39260 const X86Subtarget &Subtarget) {
39261 if (DCI.isBeforeLegalize())
39265 unsigned Opcode = N->getOpcode();
39266 MVT VT = N->getSimpleValueType(0);
39267 MVT SVT = VT.getVectorElementType();
39268 unsigned NumElts = VT.getVectorNumElements();
39269 unsigned EltSizeInBits = SVT.getSizeInBits();
39271 SDValue Op = N->getOperand(0);
39272 MVT OpVT = Op.getSimpleValueType();
39273 MVT OpEltVT = OpVT.getVectorElementType();
39274 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
39275 unsigned InputBits = OpEltSizeInBits * NumElts;
39277 // Perform any constant folding.
39278 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
39280 SmallVector<APInt, 64> EltBits;
39281 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
39282 APInt Undefs(NumElts, 0);
39283 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
39285 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
39286 for (unsigned i = 0; i != NumElts; ++i) {
39287 if (UndefElts[i]) {
39291 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
39292 : EltBits[i].sextOrTrunc(EltSizeInBits);
39294 return getConstVector(Vals, Undefs, VT, DAG, DL);
39297 // (vzext (bitcast (vzext (x)) -> (vzext x)
39298 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
39299 SDValue V = peekThroughBitcasts(Op);
39300 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
39301 MVT InnerVT = V.getSimpleValueType();
39302 MVT InnerEltVT = InnerVT.getVectorElementType();
39304 // If the element sizes match exactly, we can just do one larger vzext. This
39305 // is always an exact type match as vzext operates on integer types.
39306 if (OpEltVT == InnerEltVT) {
39307 assert(OpVT == InnerVT && "Types must match for vzext!");
39308 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
39311 // The only other way we can combine them is if only a single element of the
39312 // inner vzext is used in the input to the outer vzext.
39313 if (InnerEltVT.getSizeInBits() < InputBits)
39316 // In this case, the inner vzext is completely dead because we're going to
39317 // only look at bits inside of the low element. Just do the outer vzext on
39318 // a bitcast of the input to the inner.
39319 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
39322 // Check if we can bypass extracting and re-inserting an element of an input
39323 // vector. Essentially:
39324 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
39325 // TODO: Add X86ISD::VSEXT support
39326 if (Opcode == X86ISD::VZEXT &&
39327 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39328 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39329 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
39330 SDValue ExtractedV = V.getOperand(0);
39331 SDValue OrigV = ExtractedV.getOperand(0);
39332 if (isNullConstant(ExtractedV.getOperand(1))) {
39333 MVT OrigVT = OrigV.getSimpleValueType();
39334 // Extract a subvector if necessary...
39335 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
39336 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
39337 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
39338 OrigVT.getVectorNumElements() / Ratio);
39339 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
39340 DAG.getIntPtrConstant(0, DL));
39342 Op = DAG.getBitcast(OpVT, OrigV);
39343 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
39350 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
39351 const X86Subtarget &Subtarget) {
39352 MVT VT = N->getSimpleValueType(0);
39355 if (N->getOperand(0) == N->getOperand(1)) {
39356 if (N->getOpcode() == X86ISD::PCMPEQ)
39357 return getOnesVector(VT, DAG, DL);
39358 if (N->getOpcode() == X86ISD::PCMPGT)
39359 return getZeroVector(VT, Subtarget, DAG, DL);
39365 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
39366 TargetLowering::DAGCombinerInfo &DCI,
39367 const X86Subtarget &Subtarget) {
39368 if (DCI.isBeforeLegalizeOps())
39371 MVT OpVT = N->getSimpleValueType(0);
39373 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
39376 SDValue Vec = N->getOperand(0);
39377 SDValue SubVec = N->getOperand(1);
39379 unsigned IdxVal = N->getConstantOperandVal(2);
39380 MVT SubVecVT = SubVec.getSimpleValueType();
39382 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
39383 // Inserting zeros into zeros is a nop.
39384 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39385 return getZeroVector(OpVT, Subtarget, DAG, dl);
39387 // If we're inserting into a zero vector and then into a larger zero vector,
39388 // just insert into the larger zero vector directly.
39389 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39390 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
39391 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
39392 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39393 getZeroVector(OpVT, Subtarget, DAG, dl),
39394 SubVec.getOperand(1),
39395 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
39398 // If we're inserting into a zero vector and our input was extracted from an
39399 // insert into a zero vector of the same type and the extraction was at
39400 // least as large as the original insertion. Just insert the original
39401 // subvector into a zero vector.
39402 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
39403 SubVec.getConstantOperandVal(1) == 0 &&
39404 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
39405 SDValue Ins = SubVec.getOperand(0);
39406 if (Ins.getConstantOperandVal(2) == 0 &&
39407 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
39408 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
39409 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39410 getZeroVector(OpVT, Subtarget, DAG, dl),
39411 Ins.getOperand(1), N->getOperand(2));
39414 // If we're inserting a bitcast into zeros, rewrite the insert and move the
39415 // bitcast to the other side. This helps with detecting zero extending
39417 // TODO: Is this useful for other indices than 0?
39418 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
39419 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
39420 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
39421 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
39422 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
39423 DAG.getBitcast(NewVT, Vec),
39424 SubVec.getOperand(0), N->getOperand(2));
39425 return DAG.getBitcast(OpVT, Insert);
39429 // Stop here if this is an i1 vector.
39433 // If this is an insert of an extract, combine to a shuffle. Don't do this
39434 // if the insert or extract can be represented with a subregister operation.
39435 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39436 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
39437 (IdxVal != 0 || !Vec.isUndef())) {
39438 int ExtIdxVal = SubVec.getConstantOperandVal(1);
39439 if (ExtIdxVal != 0) {
39440 int VecNumElts = OpVT.getVectorNumElements();
39441 int SubVecNumElts = SubVecVT.getVectorNumElements();
39442 SmallVector<int, 64> Mask(VecNumElts);
39443 // First create an identity shuffle mask.
39444 for (int i = 0; i != VecNumElts; ++i)
39446 // Now insert the extracted portion.
39447 for (int i = 0; i != SubVecNumElts; ++i)
39448 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
39450 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
39454 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
39456 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39457 // (load16 addr + 16), Elts/2)
39460 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39461 // (load32 addr + 32), Elts/2)
39463 // or a 16-byte or 32-byte broadcast:
39464 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39465 // (load16 addr), Elts/2)
39466 // --> X86SubVBroadcast(load16 addr)
39468 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39469 // (load32 addr), Elts/2)
39470 // --> X86SubVBroadcast(load32 addr)
39471 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
39472 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39473 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
39474 if (isNullConstant(Vec.getOperand(2))) {
39475 SDValue SubVec2 = Vec.getOperand(1);
39476 // If needed, look through bitcasts to get to the load.
39477 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
39479 unsigned Alignment = FirstLd->getAlignment();
39480 unsigned AS = FirstLd->getAddressSpace();
39481 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
39482 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
39483 OpVT, AS, Alignment, &Fast) && Fast) {
39484 SDValue Ops[] = {SubVec2, SubVec};
39485 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
39490 // If lower/upper loads are the same and the only users of the load, then
39491 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
39492 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
39493 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
39494 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
39495 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
39497 // If this is subv_broadcast insert into both halves, use a larger
39499 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
39500 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
39501 SubVec.getOperand(0));
39503 // If we're inserting all zeros into the upper half, change this to
39504 // an insert into an all zeros vector. We will match this to a move
39505 // with implicit upper bit zeroing during isel.
39506 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39507 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39508 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
39509 Vec.getOperand(2));
39511 // If we are inserting into both halves of the vector, the starting
39512 // vector should be undef. If it isn't, make it so. Only do this if the
39513 // the early insert has no other uses.
39514 // TODO: Should this be a generic DAG combine?
39515 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
39516 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
39517 SubVec2, Vec.getOperand(2));
39518 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
39528 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
39529 TargetLowering::DAGCombinerInfo &DCI,
39530 const X86Subtarget &Subtarget) {
39531 if (DCI.isBeforeLegalizeOps())
39534 MVT OpVT = N->getSimpleValueType(0);
39535 SDValue InVec = N->getOperand(0);
39536 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
39538 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
39539 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
39541 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
39542 if (OpVT.getScalarType() == MVT::i1)
39543 return DAG.getConstant(1, SDLoc(N), OpVT);
39544 return getOnesVector(OpVT, DAG, SDLoc(N));
39547 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
39548 return DAG.getBuildVector(
39550 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
39552 // If we're extracting the lowest subvector and we're the only user,
39553 // we may be able to perform this with a smaller vector width.
39554 if (IdxVal == 0 && InVec.hasOneUse()) {
39555 unsigned InOpcode = InVec.getOpcode();
39556 if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) {
39557 // v2f64 CVTDQ2PD(v4i32).
39558 if (InOpcode == ISD::SINT_TO_FP &&
39559 InVec.getOperand(0).getValueType() == MVT::v4i32) {
39560 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0));
39562 // v2f64 CVTPS2PD(v4f32).
39563 if (InOpcode == ISD::FP_EXTEND &&
39564 InVec.getOperand(0).getValueType() == MVT::v4f32) {
39565 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0));
39568 if ((InOpcode == X86ISD::VZEXT || InOpcode == X86ISD::VSEXT) &&
39569 OpVT.is128BitVector() &&
39570 InVec.getOperand(0).getSimpleValueType().is128BitVector()) {
39571 unsigned ExtOp = InOpcode == X86ISD::VZEXT ? ISD::ZERO_EXTEND_VECTOR_INREG
39572 : ISD::SIGN_EXTEND_VECTOR_INREG;
39573 return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
39580 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
39581 EVT VT = N->getValueType(0);
39582 SDValue Src = N->getOperand(0);
39584 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
39585 // This occurs frequently in our masked scalar intrinsic code and our
39586 // floating point select lowering with AVX512.
39587 // TODO: SimplifyDemandedBits instead?
39588 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
39589 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39590 if (C->getAPIntValue().isOneValue())
39591 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
39592 Src.getOperand(0));
39597 // Simplify PMULDQ and PMULUDQ operations.
39598 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
39599 TargetLowering::DAGCombinerInfo &DCI) {
39600 SDValue LHS = N->getOperand(0);
39601 SDValue RHS = N->getOperand(1);
39603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39604 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
39605 !DCI.isBeforeLegalizeOps());
39606 APInt DemandedMask(APInt::getLowBitsSet(64, 32));
39608 // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
39609 KnownBits LHSKnown;
39610 if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
39611 DCI.CommitTargetLoweringOpt(TLO);
39612 return SDValue(N, 0);
39615 KnownBits RHSKnown;
39616 if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
39617 DCI.CommitTargetLoweringOpt(TLO);
39618 return SDValue(N, 0);
39624 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
39625 DAGCombinerInfo &DCI) const {
39626 SelectionDAG &DAG = DCI.DAG;
39627 switch (N->getOpcode()) {
39629 case ISD::SCALAR_TO_VECTOR:
39630 return combineScalarToVector(N, DAG);
39631 case ISD::EXTRACT_VECTOR_ELT:
39632 case X86ISD::PEXTRW:
39633 case X86ISD::PEXTRB:
39634 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
39635 case ISD::INSERT_SUBVECTOR:
39636 return combineInsertSubvector(N, DAG, DCI, Subtarget);
39637 case ISD::EXTRACT_SUBVECTOR:
39638 return combineExtractSubvector(N, DAG, DCI, Subtarget);
39641 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
39642 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
39643 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
39644 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
39645 case ISD::SUB: return combineSub(N, DAG, Subtarget);
39646 case X86ISD::SBB: return combineSBB(N, DAG);
39647 case X86ISD::ADC: return combineADC(N, DAG, DCI);
39648 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
39651 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
39652 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
39653 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
39654 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
39655 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
39656 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
39657 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
39658 case ISD::STORE: return combineStore(N, DAG, Subtarget);
39659 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
39660 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
39661 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
39663 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
39664 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
39665 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
39666 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
39667 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
39668 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
39670 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
39672 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
39674 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
39675 case X86ISD::BT: return combineBT(N, DAG, DCI);
39676 case ISD::ANY_EXTEND:
39677 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
39678 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
39679 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
39680 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
39681 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
39682 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
39683 case X86ISD::PACKSS:
39684 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
39685 case X86ISD::VSHLI:
39686 case X86ISD::VSRAI:
39687 case X86ISD::VSRLI:
39688 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
39689 case ISD::SIGN_EXTEND_VECTOR_INREG:
39690 case ISD::ZERO_EXTEND_VECTOR_INREG:
39691 case X86ISD::VSEXT:
39692 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
39693 case X86ISD::PINSRB:
39694 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
39695 case X86ISD::SHUFP: // Handle all target specific shuffles
39696 case X86ISD::INSERTPS:
39697 case X86ISD::EXTRQI:
39698 case X86ISD::INSERTQI:
39699 case X86ISD::PALIGNR:
39700 case X86ISD::VSHLDQ:
39701 case X86ISD::VSRLDQ:
39702 case X86ISD::BLENDI:
39703 case X86ISD::UNPCKH:
39704 case X86ISD::UNPCKL:
39705 case X86ISD::MOVHLPS:
39706 case X86ISD::MOVLHPS:
39707 case X86ISD::PSHUFB:
39708 case X86ISD::PSHUFD:
39709 case X86ISD::PSHUFHW:
39710 case X86ISD::PSHUFLW:
39711 case X86ISD::MOVSHDUP:
39712 case X86ISD::MOVSLDUP:
39713 case X86ISD::MOVDDUP:
39714 case X86ISD::MOVSS:
39715 case X86ISD::MOVSD:
39716 case X86ISD::VBROADCAST:
39717 case X86ISD::VPPERM:
39718 case X86ISD::VPERMI:
39719 case X86ISD::VPERMV:
39720 case X86ISD::VPERMV3:
39721 case X86ISD::VPERMIL2:
39722 case X86ISD::VPERMILPI:
39723 case X86ISD::VPERMILPV:
39724 case X86ISD::VPERM2X128:
39725 case X86ISD::SHUF128:
39726 case X86ISD::VZEXT_MOVL:
39727 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
39728 case X86ISD::FMADD_RND:
39729 case X86ISD::FMSUB:
39730 case X86ISD::FMSUB_RND:
39731 case X86ISD::FNMADD:
39732 case X86ISD::FNMADD_RND:
39733 case X86ISD::FNMSUB:
39734 case X86ISD::FNMSUB_RND:
39735 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
39736 case X86ISD::FMADDSUB_RND:
39737 case X86ISD::FMSUBADD_RND:
39738 case X86ISD::FMADDSUB:
39739 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
39740 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
39741 case X86ISD::MGATHER:
39742 case X86ISD::MSCATTER:
39744 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
39745 case X86ISD::PCMPEQ:
39746 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
39747 case X86ISD::PMULDQ:
39748 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
39754 /// Return true if the target has native support for the specified value type
39755 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
39756 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
39757 /// some i16 instructions are slow.
39758 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
39759 if (!isTypeLegal(VT))
39762 // There are no vXi8 shifts.
39763 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
39766 if (VT != MVT::i16)
39773 case ISD::SIGN_EXTEND:
39774 case ISD::ZERO_EXTEND:
39775 case ISD::ANY_EXTEND:
39788 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
39789 SDValue Value, SDValue Addr,
39790 SelectionDAG &DAG) const {
39791 const Module *M = DAG.getMachineFunction().getMMI().getModule();
39792 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
39793 if (IsCFProtectionSupported) {
39794 // In case control-flow branch protection is enabled, we need to add
39795 // notrack prefix to the indirect branch.
39796 // In order to do that we create NT_BRIND SDNode.
39797 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
39798 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
39801 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
39804 /// This method query the target whether it is beneficial for dag combiner to
39805 /// promote the specified node. If true, it should return the desired promotion
39806 /// type by reference.
39807 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
39808 EVT VT = Op.getValueType();
39809 if (VT != MVT::i16)
39812 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
39813 if (!Op.hasOneUse())
39815 SDNode *User = *Op->use_begin();
39816 if (!ISD::isNormalStore(User))
39818 auto *Ld = cast<LoadSDNode>(Load);
39819 auto *St = cast<StoreSDNode>(User);
39820 return Ld->getBasePtr() == St->getBasePtr();
39823 bool Commute = false;
39824 switch (Op.getOpcode()) {
39825 default: return false;
39826 case ISD::SIGN_EXTEND:
39827 case ISD::ZERO_EXTEND:
39828 case ISD::ANY_EXTEND:
39832 SDValue N0 = Op.getOperand(0);
39833 // Look out for (store (shl (load), x)).
39834 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
39846 SDValue N0 = Op.getOperand(0);
39847 SDValue N1 = Op.getOperand(1);
39848 // Avoid disabling potential load folding opportunities.
39849 if (MayFoldLoad(N1) &&
39850 (!Commute || !isa<ConstantSDNode>(N0) ||
39851 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
39853 if (MayFoldLoad(N0) &&
39854 ((Commute && !isa<ConstantSDNode>(N1)) ||
39855 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
39864 bool X86TargetLowering::
39865 isDesirableToCombineBuildVectorToShuffleTruncate(
39866 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
39868 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
39869 "Element count mismatch");
39871 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
39872 "Shuffle Mask expected to be legal");
39874 // For 32-bit elements VPERMD is better than shuffle+truncate.
39875 // TODO: After we improve lowerBuildVector, add execption for VPERMW.
39876 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
39879 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
39885 //===----------------------------------------------------------------------===//
39886 // X86 Inline Assembly Support
39887 //===----------------------------------------------------------------------===//
39889 // Helper to match a string separated by whitespace.
39890 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
39891 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
39893 for (StringRef Piece : Pieces) {
39894 if (!S.startswith(Piece)) // Check if the piece matches.
39897 S = S.substr(Piece.size());
39898 StringRef::size_type Pos = S.find_first_not_of(" \t");
39899 if (Pos == 0) // We matched a prefix.
39908 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
39910 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
39911 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
39912 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
39913 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
39915 if (AsmPieces.size() == 3)
39917 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
39924 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
39925 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
39927 const std::string &AsmStr = IA->getAsmString();
39929 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
39930 if (!Ty || Ty->getBitWidth() % 16 != 0)
39933 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
39934 SmallVector<StringRef, 4> AsmPieces;
39935 SplitString(AsmStr, AsmPieces, ";\n");
39937 switch (AsmPieces.size()) {
39938 default: return false;
39940 // FIXME: this should verify that we are targeting a 486 or better. If not,
39941 // we will turn this bswap into something that will be lowered to logical
39942 // ops instead of emitting the bswap asm. For now, we don't support 486 or
39943 // lower so don't worry about this.
39945 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
39946 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
39947 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
39948 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
39949 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
39950 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
39951 // No need to check constraints, nothing other than the equivalent of
39952 // "=r,0" would be valid here.
39953 return IntrinsicLowering::LowerToByteSwap(CI);
39956 // rorw $$8, ${0:w} --> llvm.bswap.i16
39957 if (CI->getType()->isIntegerTy(16) &&
39958 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39959 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
39960 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
39962 StringRef ConstraintsStr = IA->getConstraintString();
39963 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39964 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39965 if (clobbersFlagRegisters(AsmPieces))
39966 return IntrinsicLowering::LowerToByteSwap(CI);
39970 if (CI->getType()->isIntegerTy(32) &&
39971 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39972 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
39973 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
39974 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
39976 StringRef ConstraintsStr = IA->getConstraintString();
39977 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39978 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39979 if (clobbersFlagRegisters(AsmPieces))
39980 return IntrinsicLowering::LowerToByteSwap(CI);
39983 if (CI->getType()->isIntegerTy(64)) {
39984 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
39985 if (Constraints.size() >= 2 &&
39986 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
39987 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
39988 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
39989 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
39990 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
39991 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
39992 return IntrinsicLowering::LowerToByteSwap(CI);
40000 /// Given a constraint letter, return the type of constraint for this target.
// NOTE(review): most of this function's case labels were lost in this paste
// (original lines 40005-40015, 40018-40040 and 40043-40060 are missing);
// only the fragments below survive. Restore from upstream before compiling.
40001 X86TargetLowering::ConstraintType
40002 X86TargetLowering::getConstraintType(StringRef Constraint) const {
// Single-letter GCC constraints are classified by a switch; the surviving
// fragment shows 'k' (AVX512 mask registers) mapping to a register class.
40003 if (Constraint.size() == 1) {
40004 switch (Constraint[0]) {
40016 case 'k': // AVX512 masking registers.
40017 return C_RegisterClass;
// Two-letter constraints (e.g. "Y<suffix>") nest a second switch on the
// suffix character.
40041 else if (Constraint.size() == 2) {
40042 switch (Constraint[0]) {
40046 switch (Constraint[1]) {
40057 return C_RegisterClass;
// Anything unrecognized falls back to the target-independent classifier.
40061 return TargetLowering::getConstraintType(Constraint);
40064 /// Examine constraint type and operand type and determine a weight value.
40065 /// This object must already have been set up with the operand type
40066 /// and the current alternative constraint selected.
// NOTE(review): the switch's case labels ('r', 'f', 'y', 'x'/'v', 'Y',
// 'I'..'N', 'G', 'e', 'Z', ...) were lost in this paste; the bodies below
// are orphaned. Restore the labels from upstream before compiling.
40067 TargetLowering::ConstraintWeight
40068 X86TargetLowering::getSingleConstraintMatchWeight(
40069 AsmOperandInfo &info, const char *constraint) const {
40070 ConstraintWeight weight = CW_Invalid;
40071 Value *CallOperandVal = info.CallOperandVal;
40072 // If we don't have a value, we can't do a match,
40073 // but allow it at the lowest weight.
40074 if (!CallOperandVal)
40076 Type *type = CallOperandVal->getType();
40077 // Look at the constraint type.
40078 switch (*constraint) {
// Default: defer to the target-independent weighting.
40080 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
// Specific-register constraints: integer, FP-stack and MMX operand checks.
40092 if (CallOperandVal->getType()->isIntegerTy())
40093 weight = CW_SpecificReg;
40098 if (type->isFloatingPointTy())
40099 weight = CW_SpecificReg;
40102 if (type->isX86_MMXTy() && Subtarget.hasMMX())
40103 weight = CW_SpecificReg;
// 'Y' family: a two-character constraint selects a sub-variant; a bare 'Y'
// is treated as 'Yi'.
40106 unsigned Size = StringRef(constraint).size();
40107 // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
40108 char NextChar = Size == 2 ? constraint[1] : 'i';
40111 switch (NextChar) {
40117 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
40118 return CW_SpecificReg;
40120 // Conditional OpMask regs (AVX512)
40122 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
40123 return CW_Register;
40127 if (type->isX86_MMXTy() && Subtarget.hasMMX())
40130 // Any SSE reg when ISA >= SSE2, same as 'Y'
40134 if (!Subtarget.hasSSE2())
40138 // Fall through (handle "Y" constraint).
// Vector register weights by width/feature: 512-bit needs AVX512,
// 128/256-bit need SSE1/AVX respectively.
40142 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
40143 weight = CW_Register;
40146 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
40147 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
40148 weight = CW_Register;
40151 // Enable conditional vector operations using %k<#> registers.
40152 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
40153 weight = CW_Register;
// Immediate constraints: each checks the constant against the gcc-defined
// x86 range ('I' 0-31, 'J' 0-63, 'K' signed 8-bit, 'L' 0xff/0xffff,
// 'M' 0-3, 'N' 0-255, 'G' FP constant, 'e' signed 32-bit, 'Z' unsigned
// 32-bit).
40156 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
40157 if (C->getZExtValue() <= 31)
40158 weight = CW_Constant;
40162 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40163 if (C->getZExtValue() <= 63)
40164 weight = CW_Constant;
40168 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40169 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
40170 weight = CW_Constant;
40174 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40175 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
40176 weight = CW_Constant;
40180 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40181 if (C->getZExtValue() <= 3)
40182 weight = CW_Constant;
40186 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40187 if (C->getZExtValue() <= 0xff)
40188 weight = CW_Constant;
40193 if (isa<ConstantFP>(CallOperandVal)) {
40194 weight = CW_Constant;
40198 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40199 if ((C->getSExtValue() >= -0x80000000LL) &&
40200 (C->getSExtValue() <= 0x7fffffffLL))
40201 weight = CW_Constant;
40205 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
40206 if (C->getZExtValue() <= 0xffffffff)
40207 weight = CW_Constant;
40214 /// Try to replace an X constraint, which matches anything, with another that
40215 /// has more specific requirements based on the type of the corresponding
40217 const char *X86TargetLowering::
40218 LowerXConstraint(EVT ConstraintVT) const {
40219 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
40220 // 'f' like normal targets.
40221 if (ConstraintVT.isFloatingPoint()) {
40222 if (Subtarget.hasSSE2())
40224 if (Subtarget.hasSSE1())
40228 return TargetLowering::LowerXConstraint(ConstraintVT);
40231 /// Lower the specified operand into the Ops vector.
40232 /// If it is invalid, don't add anything to Ops.
40233 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
40234 std::string &Constraint,
40235 std::vector<SDValue>&Ops,
40236 SelectionDAG &DAG) const {
40239 // Only support length 1 constraints for now.
40240 if (Constraint.length() > 1) return;
40242 char ConstraintLetter = Constraint[0];
40243 switch (ConstraintLetter) {
40246 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40247 if (C->getZExtValue() <= 31) {
40248 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40249 Op.getValueType());
40255 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40256 if (C->getZExtValue() <= 63) {
40257 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40258 Op.getValueType());
40264 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40265 if (isInt<8>(C->getSExtValue())) {
40266 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40267 Op.getValueType());
40273 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40274 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
40275 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
40276 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
40277 Op.getValueType());
40283 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40284 if (C->getZExtValue() <= 3) {
40285 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40286 Op.getValueType());
40292 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40293 if (C->getZExtValue() <= 255) {
40294 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40295 Op.getValueType());
40301 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40302 if (C->getZExtValue() <= 127) {
40303 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40304 Op.getValueType());
40310 // 32-bit signed value
40311 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40312 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
40313 C->getSExtValue())) {
40314 // Widen to 64 bits here to get it sign extended.
40315 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
40318 // FIXME gcc accepts some relocatable values here too, but only in certain
40319 // memory models; it's complicated.
40324 // 32-bit unsigned value
40325 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
40326 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
40327 C->getZExtValue())) {
40328 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
40329 Op.getValueType());
40333 // FIXME gcc accepts some relocatable values here too, but only in certain
40334 // memory models; it's complicated.
40338 // Literal immediates are always ok.
40339 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
40340 // Widen to 64 bits here to get it sign extended.
40341 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
40345 // In any sort of PIC mode addresses need to be computed at runtime by
40346 // adding in a register or some sort of table lookup. These can't
40347 // be used as immediates.
40348 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
40351 // If we are in non-pic codegen mode, we allow the address of a global (with
40352 // an optional displacement) to be used with 'i'.
40353 GlobalAddressSDNode *GA = nullptr;
40354 int64_t Offset = 0;
40356 // Match either (GA), (GA+C), (GA+C1+C2), etc.
40358 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
40359 Offset += GA->getOffset();
40361 } else if (Op.getOpcode() == ISD::ADD) {
40362 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
40363 Offset += C->getZExtValue();
40364 Op = Op.getOperand(0);
40367 } else if (Op.getOpcode() == ISD::SUB) {
40368 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
40369 Offset += -C->getZExtValue();
40370 Op = Op.getOperand(0);
40375 // Otherwise, this isn't something we can handle, reject it.
40379 const GlobalValue *GV = GA->getGlobal();
40380 // If we require an extra load to get this address, as in PIC mode, we
40381 // can't accept it.
40382 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
40385 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
40386 GA->getValueType(0), Offset);
40391 if (Result.getNode()) {
40392 Ops.push_back(Result);
40395 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
40398 /// Check if \p RC is a general purpose register class.
40399 /// I.e., GR* or one of their variant.
40400 static bool isGRClass(const TargetRegisterClass &RC) {
40401 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
40402 RC.hasSuperClassEq(&X86::GR16RegClass) ||
40403 RC.hasSuperClassEq(&X86::GR32RegClass) ||
40404 RC.hasSuperClassEq(&X86::GR64RegClass) ||
40405 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
40408 /// Check if \p RC is a vector register class.
40409 /// I.e., FR* / VR* or one of their variant.
40410 static bool isFRClass(const TargetRegisterClass &RC) {
40411 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
40412 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
40413 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
40414 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
40415 RC.hasSuperClassEq(&X86::VR512RegClass);
// Map an inline-asm register constraint to a (physreg, register class) pair.
// NOTE(review): many lines of this function were lost in the paste (the 'k'
// VT case labels, the 'v' case label, vector-VT case labels, several 'Y'
// sub-constraint labels, closing braces, and the final 'return Res;').
// Restore from upstream LLVM before compiling; only add comments here.
40418 std::pair<unsigned, const TargetRegisterClass *>
40419 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
40420 StringRef Constraint,
40422 // First, see if this is a constraint that directly corresponds to an LLVM
40424 if (Constraint.size() == 1) {
40425 // GCC Constraint Letters
40426 switch (Constraint[0]) {
40428 // TODO: Slight differences here in allocation order and leaving
40429 // RIP in the class. Do they matter any more here than they do
40430 // in the normal allocation?
// 'k': AVX512 mask registers, picked by mask width (case labels lost).
40432 if (Subtarget.hasAVX512()) {
40433 // Only supported in AVX512 or later.
40434 switch (VT.SimpleTy) {
40437 return std::make_pair(0U, &X86::VK32RegClass);
40439 return std::make_pair(0U, &X86::VK16RegClass);
40441 return std::make_pair(0U, &X86::VK8RegClass);
40443 return std::make_pair(0U, &X86::VK1RegClass);
40445 return std::make_pair(0U, &X86::VK64RegClass);
40449 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
40450 if (Subtarget.is64Bit()) {
40451 if (VT == MVT::i32 || VT == MVT::f32)
40452 return std::make_pair(0U, &X86::GR32RegClass);
40453 if (VT == MVT::i16)
40454 return std::make_pair(0U, &X86::GR16RegClass);
40455 if (VT == MVT::i8 || VT == MVT::i1)
40456 return std::make_pair(0U, &X86::GR8RegClass);
40457 if (VT == MVT::i64 || VT == MVT::f64)
40458 return std::make_pair(0U, &X86::GR64RegClass);
40462 // 32-bit fallthrough
40463 case 'Q': // Q_REGS
40464 if (VT == MVT::i32 || VT == MVT::f32)
40465 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
40466 if (VT == MVT::i16)
40467 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
40468 if (VT == MVT::i8 || VT == MVT::i1)
40469 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
40470 if (VT == MVT::i64)
40471 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
40473 case 'r': // GENERAL_REGS
40474 case 'l': // INDEX_REGS
40475 if (VT == MVT::i8 || VT == MVT::i1)
40476 return std::make_pair(0U, &X86::GR8RegClass);
40477 if (VT == MVT::i16)
40478 return std::make_pair(0U, &X86::GR16RegClass);
40479 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
40480 return std::make_pair(0U, &X86::GR32RegClass);
40481 return std::make_pair(0U, &X86::GR64RegClass);
40482 case 'R': // LEGACY_REGS
40483 if (VT == MVT::i8 || VT == MVT::i1)
40484 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
40485 if (VT == MVT::i16)
40486 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
40487 if (VT == MVT::i32 || !Subtarget.is64Bit())
40488 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
40489 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
40490 case 'f': // FP Stack registers.
40491 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
40492 // value to the correct fpstack register class.
40493 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
40494 return std::make_pair(0U, &X86::RFP32RegClass);
40495 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
40496 return std::make_pair(0U, &X86::RFP64RegClass);
40497 return std::make_pair(0U, &X86::RFP80RegClass);
40498 case 'y': // MMX_REGS if MMX allowed.
40499 if (!Subtarget.hasMMX()) break;
40500 return std::make_pair(0U, &X86::VR64RegClass);
40501 case 'Y': // SSE_REGS if SSE2 allowed
40502 if (!Subtarget.hasSSE2()) break;
// 'Y' falls through to the SSE handling below; the 'v' case label between
// these fragments was lost.
40505 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
40506 if (!Subtarget.hasSSE1()) break;
40507 bool VConstraint = (Constraint[0] == 'v');
40509 switch (VT.SimpleTy) {
40511 // Scalar SSE types.
40514 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
40515 return std::make_pair(0U, &X86::FR32XRegClass);
40516 return std::make_pair(0U, &X86::FR32RegClass);
40519 if (VConstraint && Subtarget.hasVLX())
40520 return std::make_pair(0U, &X86::FR64XRegClass);
40521 return std::make_pair(0U, &X86::FR64RegClass);
40522 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
// 128-bit vector types (case labels lost).
40530 if (VConstraint && Subtarget.hasVLX())
40531 return std::make_pair(0U, &X86::VR128XRegClass);
40532 return std::make_pair(0U, &X86::VR128RegClass);
// 256-bit vector types (case labels lost).
40540 if (VConstraint && Subtarget.hasVLX())
40541 return std::make_pair(0U, &X86::VR256XRegClass);
40542 return std::make_pair(0U, &X86::VR256RegClass);
// 512-bit vector types (case labels lost).
40547 return std::make_pair(0U, &X86::VR512RegClass);
40551 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
40552 switch (Constraint[1]) {
// "Yi"/"Y2"/etc. sub-constraints (labels lost); one variant recurses as a
// plain "Y", another selects MMX, another pins XMM0.
40558 return getRegForInlineAsmConstraint(TRI, "Y", VT);
40560 if (!Subtarget.hasMMX()) break;
40561 return std::make_pair(0U, &X86::VR64RegClass);
40564 if (!Subtarget.hasSSE1()) break;
40565 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
40567 // This register class doesn't allocate k0 for masked vector operation.
40568 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
40569 switch (VT.SimpleTy) {
40572 return std::make_pair(0U, &X86::VK32WMRegClass);
40574 return std::make_pair(0U, &X86::VK16WMRegClass);
40576 return std::make_pair(0U, &X86::VK8WMRegClass);
40578 return std::make_pair(0U, &X86::VK1WMRegClass);
40580 return std::make_pair(0U, &X86::VK64WMRegClass);
40587 // Use the default implementation in TargetLowering to convert the register
40588 // constraint into a member of a register class.
40589 std::pair<unsigned, const TargetRegisterClass*> Res;
40590 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
40592 // Not found as a standard register?
40594 // Map st(0) -> st(7) -> ST0
40595 if (Constraint.size() == 7 && Constraint[0] == '{' &&
40596 tolower(Constraint[1]) == 's' &&
40597 tolower(Constraint[2]) == 't' &&
40598 Constraint[3] == '(' &&
40599 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
40600 Constraint[5] == ')' &&
40601 Constraint[6] == '}') {
40603 Res.first = X86::FP0+Constraint[4]-'0';
40604 Res.second = &X86::RFP80RegClass;
40608 // GCC allows "st(0)" to be called just plain "st".
40609 if (StringRef("{st}").equals_lower(Constraint)) {
40610 Res.first = X86::FP0;
40611 Res.second = &X86::RFP80RegClass;
// flags clobber maps to EFLAGS/CCR.
40616 if (StringRef("{flags}").equals_lower(Constraint)) {
40617 Res.first = X86::EFLAGS;
40618 Res.second = &X86::CCRRegClass;
40622 // 'A' means [ER]AX + [ER]DX.
40623 if (Constraint == "A") {
40624 if (Subtarget.is64Bit()) {
40625 Res.first = X86::RAX;
40626 Res.second = &X86::GR64_ADRegClass;
40628 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
40629 "Expecting 64, 32 or 16 bit subtarget");
40630 Res.first = X86::EAX;
40631 Res.second = &X86::GR32_ADRegClass;
40638 // Make sure it isn't a register that requires 64-bit mode.
40639 if (!Subtarget.is64Bit() &&
40640 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
40641 TRI->getEncodingValue(Res.first) >= 8) {
40642 // Register requires REX prefix, but we're in 32-bit mode.
40644 Res.second = nullptr;
40648 // Make sure it isn't a register that requires AVX512.
40649 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
40650 TRI->getEncodingValue(Res.first) & 0x10) {
40651 // Register requires EVEX prefix.
40653 Res.second = nullptr;
40657 // Otherwise, check to see if this is a register class of the wrong value
40658 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
40659 // turn into {ax},{dx}.
40660 // MVT::Other is used to specify clobber names.
40661 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
40662 return Res; // Correct type already, nothing to do.
40664 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
40665 // return "eax". This should even work for things like getting 64bit integer
40666 // registers when given an f64 type.
40667 const TargetRegisterClass *Class = Res.second;
40668 // The generic code will match the first register class that contains the
40669 // given register. Thus, based on the ordering of the tablegened file,
40670 // the "plain" GR classes might not come first.
40671 // Therefore, use a helper method.
40672 if (isGRClass(*Class)) {
40673 unsigned Size = VT.getSizeInBits();
40674 if (Size == 1) Size = 8;
40675 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
40677 bool is64Bit = Subtarget.is64Bit();
40678 const TargetRegisterClass *RC =
40679 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
40680 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
40681 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
40682 : &X86::GR64RegClass;
40683 if (RC->contains(DestReg))
40684 Res = std::make_pair(DestReg, RC);
40686 // No register found/type mismatch.
40688 Res.second = nullptr;
40690 } else if (isFRClass(*Class)) {
40691 // Handle references to XMM physical registers that got mapped into the
40692 // wrong class. This can happen with constraints like {xmm0} where the
40693 // target independent register mapper will just pick the first match it can
40694 // find, ignoring the required type.
40696 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
40697 if (VT == MVT::f32 || VT == MVT::i32)
40698 Res.second = &X86::FR32RegClass;
40699 else if (VT == MVT::f64 || VT == MVT::i64)
40700 Res.second = &X86::FR64RegClass;
40701 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
40702 Res.second = &X86::VR128RegClass;
40703 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
40704 Res.second = &X86::VR256RegClass;
40705 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
40706 Res.second = &X86::VR512RegClass;
40708 // Type mismatch and not a clobber: Return an error;
40710 Res.second = nullptr;
40717 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
40718 const AddrMode &AM, Type *Ty,
40719 unsigned AS) const {
40720 // Scaling factors are not free at all.
40721 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
40722 // will take 2 allocations in the out of order engine instead of 1
40723 // for plain addressing mode, i.e. inst (reg1).
40725 // vaddps (%rsi,%rdx), %ymm0, %ymm1
40726 // Requires two allocations (one for the load, one for the computation)
40728 // vaddps (%rsi), %ymm0, %ymm1
40729 // Requires just 1 allocation, i.e., freeing allocations for other operations
40730 // and having less micro operations to execute.
40732 // For some X86 architectures, this is even worse because for instance for
40733 // stores, the complex addressing mode forces the instruction to use the
40734 // "load" ports instead of the dedicated "store" port.
40735 // E.g., on Haswell:
40736 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
40737 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
40738 if (isLegalAddressingMode(DL, AM, Ty, AS))
40739 // Scale represents reg2 * scale, thus account for 1
40740 // as soon as we use a second register.
40741 return AM.Scale != 0;
40745 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
40746 // Integer division on x86 is expensive. However, when aggressively optimizing
40747 // for code size, we prefer to use a div instruction, as it is usually smaller
40748 // than the alternative sequence.
40749 // The exception to this is vector division. Since x86 doesn't have vector
40750 // integer division, leaving the division as-is is a loss even in terms of
40751 // size, because it will have to be scalarized, while the alternative code
40752 // sequence can be performed in vector form.
40754 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
40755 return OptSize && !VT.isVector();
40758 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
40759 if (!Subtarget.is64Bit())
40762 // Update IsSplitCSR in X86MachineFunctionInfo.
40763 X86MachineFunctionInfo *AFI =
40764 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
40765 AFI->setIsSplitCSR(true);
40768 void X86TargetLowering::insertCopiesSplitCSR(
40769 MachineBasicBlock *Entry,
40770 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
40771 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
40772 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
40776 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
40777 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
40778 MachineBasicBlock::iterator MBBI = Entry->begin();
40779 for (const MCPhysReg *I = IStart; *I; ++I) {
40780 const TargetRegisterClass *RC = nullptr;
40781 if (X86::GR64RegClass.contains(*I))
40782 RC = &X86::GR64RegClass;
40784 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
40786 unsigned NewVR = MRI->createVirtualRegister(RC);
40787 // Create copy from CSR to a virtual register.
40788 // FIXME: this currently does not emit CFI pseudo-instructions, it works
40789 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
40790 // nounwind. If we want to generalize this later, we may need to emit
40791 // CFI pseudo-instructions.
40792 assert(Entry->getParent()->getFunction().hasFnAttribute(
40793 Attribute::NoUnwind) &&
40794 "Function should be nounwind in insertCopiesSplitCSR!");
40795 Entry->addLiveIn(*I);
40796 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
40799 // Insert the copy-back instructions right before the terminator.
40800 for (auto *Exit : Exits)
40801 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
40802 TII->get(TargetOpcode::COPY), *I)
40807 bool X86TargetLowering::supportSwiftError() const {
40808 return Subtarget.is64Bit();
40811 /// Returns the name of the symbol used to emit stack probes or the empty
40812 /// string if not applicable.
40813 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
40814 // If the function specifically requests stack probes, emit them.
40815 if (MF.getFunction().hasFnAttribute("probe-stack"))
40816 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
40818 // Generally, if we aren't on Windows, the platform ABI does not include
40819 // support for stack probes, so don't emit them.
40820 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
40821 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
40824 // We need a stack probe to conform to the Windows ABI. Choose the right
40826 if (Subtarget.is64Bit())
40827 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
40828 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";